Skip to content

Instantly share code, notes, and snippets.

@kilian-gebhardt
Last active May 11, 2019 16:24
Show Gist options
  • Save kilian-gebhardt/d7e70e329cd696ee8b367af009e96fb7 to your computer and use it in GitHub Desktop.
Save kilian-gebhardt/d7e70e329cd696ee8b367af009e96fb7 to your computer and use it in GitHub Desktop.
Script for ingestion of the LREC 2016 proceedings
#!/bin/python3
import bibtexparser
import subprocess
import sys, os
# Output location: <OUTPUT_ROOT>/anthology/L/L16, where OUTPUT_ROOT is the
# first command-line argument.
if len(sys.argv) < 2:
    sys.exit("usage: %s OUTPUT_ROOT" % sys.argv[0])
OUTPUT_ROOT = sys.argv[1]
ACL = "anthology/L/L16"
OUTPUT_DIR = os.path.join(OUTPUT_ROOT, ACL)
# exist_ok=True tolerates a directory that already exists (e.g. on a re-run)
# but still raises if the path exists and is not a directory.  The previous
# try/except referenced the unimported ``errno`` module and an undefined
# ``path`` variable, so an existing directory ended in a NameError.
os.makedirs(OUTPUT_DIR, exist_ok=True)
# manually copy the bibtex data from bibtex.html into bibtex.bib
with open('bibtex.bib') as bib_file:
    bib_db = bibtexparser.load(bib_file)

# Shell pipeline that extracts the abstract of one paper as LaTeX:
#   1. grep prints the '<td>Abstract</td>' line of summaries/<id>.html with
#      one line of context on each side; ``tail -n 1`` keeps the last line.
#   2. pandoc converts that HTML fragment into a standalone LaTeX document.
#   3. sed keeps \begin{document} .. \end{document}; the final tail/head
#      drop the first two and the last two lines of that range.
ABSTRACT_PIPELINE = (
    "grep -C 1 '<td>Abstract</td>' summaries/%s.html"
    " | tail -n 1"
    " | pandoc --from html --standalone -t latex --wrap=none"
    " | sed -n '/^\\\\begin{document}/,/^\\\\end{document}/p'"
    " | tail -n +3"
    " | head -n -2"
)

# build page number database and retrieve abstracts
# page_numbers maps each BibTeX key to the list of page numbers found in the
# corresponding PDF.
page_numbers = {}
for key, entry in bib_db.get_entry_dict().items():
    # The LREC paper id is the last dot-separated component of the key.
    lrec_id = key.split('.')[-1]
    if lrec_id == '859':
        # workaround, the grep pattern below does not work for this paper,
        # so its page numbers are hard-coded (and grep is not run at all).
        output = b'(4440)Tj\n(4441)Tj\n(4442)Tj\n(4443)Tj\n(4444)Tj'
    else:
        # Raw string: same pattern as before, without the invalid "\(" escape.
        # check_output raises CalledProcessError if grep matches nothing.
        output = subprocess.check_output(
            ['grep', '-a', '-E', r"\([0-9]*\)Tj", "pdf/%s_Paper.pdf" % lrec_id])
    # Every whitespace-separated token is expected to look like b'(1234)Tj';
    # strip the leading '(' and the trailing ')Tj'.  int() accepts bytes.
    page_numbers[key] = pages = [int(o[1:-3]) for o in output.split()]
    entry['pages'] = "%d--%d" % (min(pages), max(pages))

    # retrieve abstract
    # The command is passed as one string to the shell; lrec_id is
    # interpolated into it unquoted.
    proc = subprocess.Popen(ABSTRACT_PIPELINE % lrec_id,
                            stdout=subprocess.PIPE, shell=True)
    abstract = proc.communicate()[0].decode('utf-8')
    if abstract.isspace():
        # Report the paper whose abstract is only whitespace (the previous
        # code printed the literal string "lrec_id" instead of the id).
        print(lrec_id, abstract)
    entry['abstract'] = abstract
from bibtexparser.bwriter import BibTexWriter

# Shared writer, reused for every .bib file written further down.
writer = BibTexWriter()
# Dump the database (now carrying 'pages' and 'abstract' fields) to a
# scratch file.  The abstracts were decoded as UTF-8, so write UTF-8
# explicitly instead of relying on the locale's default encoding.
with open('/tmp/bibtex_with_abstracts.bib', 'w', encoding='utf-8') as bibfile:
    bibfile.write(writer.write(bib_db))
# Order the papers by their first page; ACL ids are then assigned
# consecutively in that order (L16-1001, L16-1002, ...).
ordered_ids = sorted(bib_db.get_entry_dict().keys(), key=lambda k: min(page_numbers[k]))
last = 0  # last page of the previously processed paper
for n, key in enumerate(ordered_ids):
    acl_id = "L16-1%03d" % (n + 1)
    bib_entry = bib_db.get_entry_dict()[key]
    print(acl_id, key, bib_entry['pages'])
    lrec_id = key.split('.')[-1]

    # Page ranges must be contiguous across papers.  Raise explicitly rather
    # than assert, because asserts are stripped when Python runs with -O.
    first_page = min(page_numbers[key])
    if first_page != last + 1:
        raise ValueError(
            "page gap or overlap at %s (%s): expected first page %d, found %d"
            % (acl_id, key, last + 1, first_page))
    last = max(page_numbers[key])

    # Link <cwd>/pdf/<lrec_id>_Paper.pdf into the output tree under the ACL
    # id.  An argument list (no shell) keeps paths containing spaces or shell
    # metacharacters intact.  As before, the exit status of ln is ignored, so
    # an already existing link does not abort the run.
    goal = os.path.join(OUTPUT_DIR, "%s.pdf" % acl_id)
    source = os.path.join(os.getcwd(), "pdf", "%s_Paper.pdf" % lrec_id)
    subprocess.call(["ln", "-s", source, goal])

    # Rewrite the entry in place; the same dict object is also part of
    # bib_db, so the combined database sees these changes as well.
    bib_entry['ID'] = acl_id
    bib_entry['url'] = acl_id
    bib_entry['month'] = 'May'
    # avoid warning during anthology generation?!
    # del bib_entry['editor']

    # Write a single-entry .bib file for this paper.
    db = bibtexparser.bibdatabase.BibDatabase()
    db.entries = [bib_entry]
    with open(os.path.join(OUTPUT_DIR, "%s.bib" % acl_id), 'w', encoding='utf-8') as bibfile:
        bibfile.write(writer.write(db))
# BibTeX entry for the proceedings volume itself (id L16-1000).
proceedings = {
    'ID': 'L16-1000',
    'ENTRYTYPE': 'proceedings',
    'title': 'Proceedings of the Tenth International Conference on Language Resources and Evaluation ({LREC} 2016)',
    'address': 'Paris, France',
    'date': '23-28',
    'author': 'Nicoletta Calzolari and Khalid Choukri and Thierry Declerck and Sara Goggi and Marko Grobelnik and Bente Maegaard and Joseph Mariani and Helene Mazo and Asuncion Moreno and Jan Odijk and Stelios Piperidis',
    'isbn': '978-2-9517408-9-1',
    'language': 'english',
    'location': 'Portorož, Slovenia',
    'month': 'May',
    'publisher': 'European Language Resources Association (ELRA)',
    # TODO: there is no front matter pdf
    # 'url': 'L16-1000',
    'year': '2016',
}

# Register the volume entry in the combined database as well.
bib_db.entries.append(proceedings)
bib_db.entries_dict['L16-1000'] = proceedings

# Stand-alone .bib file for the volume entry.  The entry contains non-ASCII
# text ('Portorož'), so write UTF-8 explicitly instead of relying on the
# locale's default encoding.
db = bibtexparser.bibdatabase.BibDatabase()
db.entries = [proceedings]
with open(os.path.join(OUTPUT_DIR, "L16-1000.bib"), 'w', encoding='utf-8') as bibfile:
    bibfile.write(writer.write(db))

# Combined .bib file with all papers plus the volume entry.
with open(os.path.join(OUTPUT_DIR, 'L16-1.bib'), 'w', encoding='utf-8') as bibfile:
    bibfile.write(writer.write(bib_db))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment