Last active
May 11, 2019 16:24
-
-
Save kilian-gebhardt/d7e70e329cd696ee8b367af009e96fb7 to your computer and use it in GitHub Desktop.
script for ingestion of LREC 2016
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/python3 | |
import bibtexparser | |
import subprocess | |
import sys, os | |
# Root directory under which the anthology tree is created; taken from the
# first command-line argument.
OUTPUT_ROOT = sys.argv[1]
# Sub-path of this volume inside the anthology tree.
ACL = "anthology/L/L16"
OUTPUT_DIR = os.path.join(OUTPUT_ROOT, ACL)
# exist_ok=True tolerates an already existing directory (e.g. on a re-run)
# while still raising if the path exists but is not a directory.  The
# previous hand-written EEXIST check referenced the unimported `errno`
# module and an undefined `path` variable, so it failed with NameError.
os.makedirs(OUTPUT_DIR, exist_ok=True)
# manually copy the bibtex data from bibtex.html into bibtex.bib
with open('bibtex.bib') as bib_file:
    bib_db = bibtexparser.load(bib_file)

# build page number database and retrieve abstracts
# page_numbers maps each BibTeX key to the list of page numbers extracted
# from the corresponding paper PDF.
page_numbers = {}
entries_by_key = bib_db.get_entry_dict()
for key, entry in entries_by_key.items():
    # The LREC paper id is the last dot-separated component of the key.
    lrec_id = key.split('.')[-1]
    # `grep -a` treats the PDF as text.  Every whitespace-separated token of
    # the output is expected to have the form "(<page>)Tj".  A raw string
    # keeps the backslashes literal without an invalid-escape warning.
    output = subprocess.check_output(
        ['grep', '-a', '-E', r"\([0-9]*\)Tj", "pdf/%s_Paper.pdf" % lrec_id])
    # workaround, the above regex does not work always
    if lrec_id == '859':
        output = '(4440)Tj\n(4441)Tj\n(4442)Tj\n(4443)Tj\n(4444)Tj'
    # Strip the leading "(" and the trailing ")Tj" from each token.
    page_numbers[key] = pages = [int(o[1:-3]) for o in output.split()]
    entry['pages'] = "%d--%d" % (min(pages), max(pages))

    # retrieve abstract
    # The pipeline takes the line following the "<td>Abstract</td>" cell of
    # the summary page, converts it from HTML to LaTeX with pandoc and keeps
    # only the text between \begin{document} and \end{document}.
    abstract_cmd = (
        "grep -C 1 '<td>Abstract</td>' summaries/%s.html"
        " | tail -n 1"
        " | pandoc --from html --standalone -t latex --wrap=none"
        " | sed -n '/^\\\\begin{document}/,/^\\\\end{document}/p'"
        " | tail -n +3 | head -n -2" % lrec_id
    )
    proc = subprocess.Popen(abstract_cmd, stdout=subprocess.PIPE, shell=True)
    abstract = proc.communicate()[0].decode('utf-8')
    # Report papers without a usable abstract.  strip() also catches the
    # completely empty string, for which str.isspace() returns False.
    if not abstract.strip():
        print("lrec_id", lrec_id, repr(abstract))
    entry['abstract'] = abstract
from bibtexparser.bwriter import BibTexWriter

# Writer instance used to serialise BibTeX databases to text.
writer = BibTexWriter()
# Dump the complete database to a fixed file under /tmp.
with open('/tmp/bibtex_with_abstracts.bib', 'w') as dump_file:
    serialized_db = writer.write(bib_db)
    dump_file.write(serialized_db)
# Order the papers by their first page number.
ordered_ids = sorted(bib_db.get_entry_dict().keys(), key=lambda k: min(page_numbers[k]))
# Last page of the previously processed paper (0 before the first one).
last = 0
for n, key in enumerate(ordered_ids):
    # Anthology ids are numbered consecutively starting at L16-1001.
    acl_id = "L16-1%03d" % (n+1)
    bib_entry = bib_db.get_entry_dict()[key]
    print(acl_id, key, bib_entry['pages'])
    lrec_id = key.split('.')[-1]

    # Sanity check: each paper must start right after the previous one ends.
    first_page = min(page_numbers[key])
    assert first_page == last + 1, (
        "page numbering is not contiguous at %s: starts at %d, "
        "previous paper ended at %d" % (key, first_page, last))
    last = max(page_numbers[key])

    # Link the paper PDF into the output tree under its anthology id.
    # Passing an argument list (no shell) keeps paths containing spaces or
    # shell metacharacters intact; as before, a failing `ln` is not fatal.
    goal = os.path.join(OUTPUT_DIR, "%s.pdf" % acl_id)
    source = os.path.abspath(os.path.join("pdf", "%s_Paper.pdf" % lrec_id))
    subprocess.call(["ln", "-s", source, goal])

    # Re-label the entry with its anthology id before writing it out.
    bib_entry['ID'] = acl_id
    bib_entry['url'] = acl_id
    bib_entry['month'] = 'May'
    # avoid warning during anthology generation?!
    # del bib_entry['editor']

    # Write a single-entry BibTeX file next to the linked PDF.
    db = bibtexparser.bibdatabase.BibDatabase()
    db.entries = [bib_entry]
    with open(os.path.join(OUTPUT_DIR, "%s.bib" % acl_id), 'w') as bibfile:
        bibfile.write(writer.write(db))
# BibTeX record for the proceedings volume itself.
proceedings = {
    'ID': 'L16-1000',
    'ENTRYTYPE': 'proceedings',
    'title': 'Proceedings of the Tenth International Conference on Language Resources and Evaluation ({LREC} 2016)',
    'address': 'Paris, France',
    'date': '23-28',
    'author': 'Nicoletta Calzolari and Khalid Choukri and Thierry Declerck and Sara Goggi and Marko Grobelnik and Bente Maegaard and Joseph Mariani and Helene Mazo and Asuncion Moreno and Jan Odijk and Stelios Piperidis',
    'isbn': '978-2-9517408-9-1',
    'language': 'english',
    'location': 'Portorož, Slovenia',
    'month': 'May',
    'publisher': 'European Language Resources Association (ELRA)',
    # TODO: there is no front matter pdf
    # 'url': 'L16-1000',
    'year': '2016',
}

# Register the volume record in the full database, both in the entry list
# and in the key-indexed dictionary.
bib_db.entries.append(proceedings)
bib_db.entries_dict[proceedings['ID']] = proceedings

# Stand-alone BibTeX file containing only the volume record.
volume_db = bibtexparser.bibdatabase.BibDatabase()
volume_db.entries = [proceedings]
volume_path = os.path.join(OUTPUT_DIR, "L16-1000.bib")
with open(volume_path, 'w') as volume_file:
    volume_file.write(writer.write(volume_db))

# Combined BibTeX file with every entry of the database.
combined_path = os.path.join(OUTPUT_DIR, 'L16-1.bib')
with open(combined_path, 'w') as combined_file:
    combined_file.write(writer.write(bib_db))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment