kilian-gebhardt · May 11, 2019 16:24
diff --git a/ingestion_helper.py b/ingestion_helper.py
 #!/bin/python3
 """Prepare the LREC 2016 papers for the ACL Anthology directory layout.

 Usage: ingestion_helper.py OUTPUT_ROOT

 Run from a directory that contains:
   bibtex.bib            BibTeX records (copied by hand from bibtex.html)
   pdf/<id>_Paper.pdf    one PDF per record
   summaries/<id>.html   one summary page per record, holding the abstract
 where <id> is the part of the BibTeX key after its last '.'.

 Results are written below OUTPUT_ROOT/anthology/L/L16:
   L16-1NNN.pdf   symlink to the paper's PDF
   L16-1NNN.bib   single-entry BibTeX file for the paper
   L16-1000.bib   the proceedings entry
   L16-1.bib      all entries, including the proceedings entry
 A copy of the input enriched with pages and abstracts is also written to
 /tmp/bibtex_with_abstracts.bib.

 Needs grep, tail, head, sed, pandoc and ln on the PATH.
 """
 import bibtexparser
 from bibtexparser.bwriter import BibTexWriter
 import subprocess
 import sys
 import os

 OUTPUT_ROOT = sys.argv[1]
 ACL = "anthology/L/L16"
 OUTPUT_DIR = os.path.join(OUTPUT_ROOT, ACL)

 # exist_ok=True accepts an already existing directory; it still raises if
 # the path exists but is not a directory.
 os.makedirs(OUTPUT_DIR, exist_ok=True)

 # manually copy the bibtex data from bibtex.html into bibtex.bib
 with open('bibtex.bib') as bib_file:
    bib_db = bibtexparser.load(bib_file)

 # The same key -> entry mapping is used throughout; the entry dicts are
 # mutated in place below.
 entries = bib_db.get_entry_dict()

 # Shell pipeline that pulls the abstract cell out of a summary page and
 # converts it to LaTeX, keeping only the body of the generated document.
 ABSTRACT_CMD = "grep -C 1 '<td>Abstract</td>' summaries/%s.html | tail -n 1 | pandoc --from html --standalone -t latex --wrap=none | sed -n '/^\\\\begin{document}/,/^\\\\end{document}/p' | tail -n +3 | head -n -2"

 # build page number database and retrieve abstracts
 page_numbers = {}
 for key in entries:
    lrec_id = key.split('.')[-1]
    # Lines of the form "(<digits>)Tj" in the PDF are treated as page numbers.
    # check_output returns bytes; int() accepts the sliced bytes directly.
    output = subprocess.check_output(
            ['grep', '-a', '-E', r"\([0-9]*\)Tj", "pdf/%s_Paper.pdf" % lrec_id])
    # workaround, the above regex does not work always
    if lrec_id == '859':
        output = '(4440)Tj\n(4441)Tj\n(4442)Tj\n(4443)Tj\n(4444)Tj'
    # o[1:-3] strips the leading "(" and the trailing ")Tj"
    page_numbers[key] = pages = [int(o[1:-3]) for o in output.split()]
    entries[key]['pages'] = "%d--%d" % (min(pages), max(pages))

    # retrieve abstract
    proc = subprocess.Popen(ABSTRACT_CMD % lrec_id,
                            stdout=subprocess.PIPE, shell=True)
    abstract = proc.communicate()[0].decode('utf-8')
    # strip() also catches completely empty output, for which isspace() is
    # False; report which record is affected so it can be fixed by hand.
    if not abstract.strip():
        print("empty abstract for lrec_id", lrec_id)
    entries[key]['abstract'] = abstract


 writer = BibTexWriter()
 with open('/tmp/bibtex_with_abstracts.bib', 'w') as bibfile:
    bibfile.write(writer.write(bib_db))

 # Anthology ids are assigned in order of each paper's first page.
 ordered_ids = sorted(entries, key=lambda k: min(page_numbers[k]))

 last = 0
 for n, key in enumerate(ordered_ids):
    acl_id = "L16-1%03d" % (n+1)
    bib_entry = entries[key]
    print(acl_id, key, bib_entry['pages'])
    lrec_id = key.split('.')[-1]
    # Each paper must start right after the previous one ended.
    first_page = min(page_numbers[key])
    assert first_page == last + 1, \
        "%s starts on page %d, expected %d" % (key, first_page, last + 1)
    last = max(page_numbers[key])
    goal = os.path.join(OUTPUT_DIR, "%s.pdf" % acl_id)
    subprocess.call("ln -s $PWD/pdf/%s_Paper.pdf %s" % (lrec_id, goal), shell=True)
    bib_entry['ID'] = acl_id
    bib_entry['url'] = acl_id
    bib_entry['month'] = 'May'
    # avoid warning during anthology generation?!
    # del bib_entry['editor']
    db = bibtexparser.bibdatabase.BibDatabase()
    db.entries = [bib_entry]
    with open(os.path.join(OUTPUT_DIR, "%s.bib" % acl_id), 'w') as bibfile:
        bibfile.write(writer.write(db))

 # Entry describing the volume as a whole.
 proceedings = {
    'ID': 'L16-1000',
    'ENTRYTYPE': 'proceedings',
    'title': 'Proceedings of the Tenth International Conference on Language Resources and Evaluation ({LREC} 2016)',
    'address': 'Paris, France',
    'date': '23-28',
    'author': 'Nicoletta Calzolari and Khalid Choukri and Thierry Declerck and Sara Goggi and Marko Grobelnik and Bente Maegaard and Joseph Mariani and Helene Mazo and Asuncion Moreno and Jan Odijk and Stelios Piperidis',
    'isbn': '978-2-9517408-9-1',
    'language': 'english',
    'location': 'Portorož, Slovenia',
    'month': 'May',
    'publisher': 'European Language Resources Association (ELRA)',
    # TODO: there is no front matter pdf
    # 'url': 'L16-1000',
    'year': '2016',
 }
 bib_db.entries.append(proceedings)
 bib_db.entries_dict['L16-1000'] = proceedings
 db = bibtexparser.bibdatabase.BibDatabase()
 db.entries = [proceedings]
 with open(os.path.join(OUTPUT_DIR, "L16-1000.bib"), 'w') as bibfile:
    bibfile.write(writer.write(db))

 # Combined file: every paper plus the proceedings entry.
 with open(os.path.join(OUTPUT_DIR, 'L16-1.bib'), 'w') as bibfile:
    bibfile.write(writer.write(bib_db))
	#!/bin/python3
	"""Prepare the LREC 2016 papers for the ACL Anthology directory layout.

	Usage: ingestion_helper.py OUTPUT_ROOT

	Run from a directory that contains:
	  bibtex.bib            BibTeX records (copied by hand from bibtex.html)
	  pdf/<id>_Paper.pdf    one PDF per record
	  summaries/<id>.html   one summary page per record, holding the abstract
	where <id> is the part of the BibTeX key after its last '.'.

	Results are written below OUTPUT_ROOT/anthology/L/L16:
	  L16-1NNN.pdf   symlink to the paper's PDF
	  L16-1NNN.bib   single-entry BibTeX file for the paper
	  L16-1000.bib   the proceedings entry
	  L16-1.bib      all entries, including the proceedings entry
	A copy of the input enriched with pages and abstracts is also written to
	/tmp/bibtex_with_abstracts.bib.

	Needs grep, tail, head, sed, pandoc and ln on the PATH.
	"""
	import bibtexparser
	from bibtexparser.bwriter import BibTexWriter
	import subprocess
	import sys
	import os

	OUTPUT_ROOT = sys.argv[1]
	ACL = "anthology/L/L16"
	OUTPUT_DIR = os.path.join(OUTPUT_ROOT, ACL)

	# exist_ok=True accepts an already existing directory; it still raises if
	# the path exists but is not a directory.
	os.makedirs(OUTPUT_DIR, exist_ok=True)

	# manually copy the bibtex data from bibtex.html into bibtex.bib
	with open('bibtex.bib') as bib_file:
	    bib_db = bibtexparser.load(bib_file)

	# The same key -> entry mapping is used throughout; the entry dicts are
	# mutated in place below.
	entries = bib_db.get_entry_dict()

	# Shell pipeline that pulls the abstract cell out of a summary page and
	# converts it to LaTeX, keeping only the body of the generated document.
	ABSTRACT_CMD = "grep -C 1 '<td>Abstract</td>' summaries/%s.html | tail -n 1 | pandoc --from html --standalone -t latex --wrap=none | sed -n '/^\\\\begin{document}/,/^\\\\end{document}/p' | tail -n +3 | head -n -2"

	# build page number database and retrieve abstracts
	page_numbers = {}
	for key in entries:
	    lrec_id = key.split('.')[-1]
	    # Lines of the form "(<digits>)Tj" in the PDF are treated as page numbers.
	    # check_output returns bytes; int() accepts the sliced bytes directly.
	    output = subprocess.check_output(
	        ['grep', '-a', '-E', r"\([0-9]*\)Tj", "pdf/%s_Paper.pdf" % lrec_id])
	    # workaround, the above regex does not work always
	    if lrec_id == '859':
	        output = '(4440)Tj\n(4441)Tj\n(4442)Tj\n(4443)Tj\n(4444)Tj'
	    # o[1:-3] strips the leading "(" and the trailing ")Tj"
	    page_numbers[key] = pages = [int(o[1:-3]) for o in output.split()]
	    entries[key]['pages'] = "%d--%d" % (min(pages), max(pages))

	    # retrieve abstract
	    proc = subprocess.Popen(ABSTRACT_CMD % lrec_id,
	                            stdout=subprocess.PIPE, shell=True)
	    abstract = proc.communicate()[0].decode('utf-8')
	    # strip() also catches completely empty output, for which isspace() is
	    # False; report which record is affected so it can be fixed by hand.
	    if not abstract.strip():
	        print("empty abstract for lrec_id", lrec_id)
	    entries[key]['abstract'] = abstract


	writer = BibTexWriter()
	with open('/tmp/bibtex_with_abstracts.bib', 'w') as bibfile:
	    bibfile.write(writer.write(bib_db))

	# Anthology ids are assigned in order of each paper's first page.
	ordered_ids = sorted(entries, key=lambda k: min(page_numbers[k]))

	last = 0
	for n, key in enumerate(ordered_ids):
	    acl_id = "L16-1%03d" % (n+1)
	    bib_entry = entries[key]
	    print(acl_id, key, bib_entry['pages'])
	    lrec_id = key.split('.')[-1]
	    # Each paper must start right after the previous one ended.
	    first_page = min(page_numbers[key])
	    assert first_page == last + 1, \
	        "%s starts on page %d, expected %d" % (key, first_page, last + 1)
	    last = max(page_numbers[key])
	    goal = os.path.join(OUTPUT_DIR, "%s.pdf" % acl_id)
	    subprocess.call("ln -s $PWD/pdf/%s_Paper.pdf %s" % (lrec_id, goal), shell=True)
	    bib_entry['ID'] = acl_id
	    bib_entry['url'] = acl_id
	    bib_entry['month'] = 'May'
	    # avoid warning during anthology generation?!
	    # del bib_entry['editor']
	    db = bibtexparser.bibdatabase.BibDatabase()
	    db.entries = [bib_entry]
	    with open(os.path.join(OUTPUT_DIR, "%s.bib" % acl_id), 'w') as bibfile:
	        bibfile.write(writer.write(db))

	# Entry describing the volume as a whole.
	proceedings = {
	    'ID': 'L16-1000',
	    'ENTRYTYPE': 'proceedings',
	    'title': 'Proceedings of the Tenth International Conference on Language Resources and Evaluation ({LREC} 2016)',
	    'address': 'Paris, France',
	    'date': '23-28',
	    'author': 'Nicoletta Calzolari and Khalid Choukri and Thierry Declerck and Sara Goggi and Marko Grobelnik and Bente Maegaard and Joseph Mariani and Helene Mazo and Asuncion Moreno and Jan Odijk and Stelios Piperidis',
	    'isbn': '978-2-9517408-9-1',
	    'language': 'english',
	    'location': 'Portorož, Slovenia',
	    'month': 'May',
	    'publisher': 'European Language Resources Association (ELRA)',
	    # TODO: there is no front matter pdf
	    # 'url': 'L16-1000',
	    'year': '2016',
	}
	bib_db.entries.append(proceedings)
	bib_db.entries_dict['L16-1000'] = proceedings
	db = bibtexparser.bibdatabase.BibDatabase()
	db.entries = [proceedings]
	with open(os.path.join(OUTPUT_DIR, "L16-1000.bib"), 'w') as bibfile:
	    bibfile.write(writer.write(db))

	# Combined file: every paper plus the proceedings entry.
	with open(os.path.join(OUTPUT_DIR, 'L16-1.bib'), 'w') as bibfile:
	    bibfile.write(writer.write(bib_db))
No results found