Skip to content

Instantly share code, notes, and snippets.

@kilian-gebhardt
Created June 24, 2019 15:44
Show Gist options
  • Select an option

  • Save kilian-gebhardt/f25c7d9124d7e45c2a09688ed81a40ad to your computer and use it in GitHub Desktop.

Select an option

Save kilian-gebhardt/f25c7d9124d7e45c2a09688ed81a40ad to your computer and use it in GitHub Desktop.
Automate the creation of IWPT 1989 proceedings and metadata
#!/bin/python
import re
import subprocess
# --- Running state, updated by the main loop for every matched TOC entry ---

# Number of TOC entries matched so far; also fills the numeric part of ID.
counter = 0
# Last page of the previously processed entry (0 before the first one); the
# next entry is expected to start on last_page + 1.
last_page = 0

# --- Fixed configuration ---

# %-template for the per-paper identifier; formatted with `counter` and used
# both as the BibTeX key and as the stem of the generated .bib/.pdf files.
ID = 'W89-01%02i'

# Added to every (corrected) page number to get the page passed to pdftk.
OFFSET = 10
# Page numbers above this value are remapped by correct_page().
LAST_CORRECT_PAGE = 374

# One table-of-contents line. Groups: 1 = optional leading '</li>',
# 2 = author text, 3 = title, 4 = first page, 5 = last page.
MATCHLINE = re.compile(
    r'^(</li>)?<li>(.*)[,.].*\.\s+(.*)\; p\. ([0-9]+)-([0-9]+)\.$'
)
def correct_page(x, last_correct_page=None):
    """Remap a page number that lies beyond the last correctly ordered page.

    Page numbers up to and including the threshold are returned unchanged.
    Above the threshold, even numbers map to ``x + 1`` and odd numbers map to
    ``x - 1``.

    Args:
        x: Page number to remap.
        last_correct_page: Highest page number that is returned unchanged.
            When omitted (``None``), the module-level ``LAST_CORRECT_PAGE``
            is used, which matches the original behaviour.

    Returns:
        The remapped page number.
    """
    if last_correct_page is None:
        last_correct_page = LAST_CORRECT_PAGE
    if x <= last_correct_page:
        return x
    # NOTE(review): with an even threshold T, T + 1 is odd and maps back to T,
    # so T and T + 1 both yield T -- confirm this matches the scanned PDF.
    return x + 1 if x % 2 == 0 else x - 1
def _split_authors(raw):
    """Split the author part of a TOC line into a list of names.

    Names are separated by ' and ', ' &amp; ' or ', '. A single trailing '*'
    is removed from each name.
    """
    names = re.split(' and | &amp; |, ', raw)
    # endswith() rather than name[-1]: an empty fragment must not raise
    # IndexError.
    return [name[:-1] if name.endswith('*') else name for name in names]


def _bibtex_entry(key, authors, title, pages):
    """Render one @inproceedings record as a string.

    Args:
        key: Citation key of the record.
        authors: List of author names; the last whitespace-separated word of
            each name is moved to the front ("Last, First").
        title: Paper title, inserted verbatim.
        pages: ``(first_page, last_page)`` tuple of ints.

    Returns:
        The complete record, terminated by a newline.
    """
    # A one-word name yields a single-element split and is emitted unchanged.
    author_field = ' and '.join(
        ', '.join(reversed(author.rsplit(maxsplit=1))) for author in authors
    )
    fields = [
        'author = {' + author_field + '},',
        'title = {' + title + '},',
        # BibTeX fields must be comma-separated; the comma after the pages
        # field was previously missing, producing malformed records.
        'pages = {' + '%i--%i' % pages + '},',
        'booktitle = {Proceedings of the First International Workshop on Parsing Technologies},',
        'address = {Pittsburgh, Pennsylvania, USA},',
        'month = aug,',
        'year = 1989,',
    ]
    body = ''.join('\t' + field + '\n' for field in fields)
    return '@inproceedings{' + key + ',\n' + body + '}\n'


# For every TOC line that matches MATCHLINE, write a .bib file and cut the
# corresponding pages out of IWPT_1989.pdf with pdftk. The output directory
# 'IWPT1989-anthology/' is not created here and must already exist.
with open('iwpt89_toc.html') as f:
    for line in f:
        m = MATCHLINE.match(line)
        if not m:
            continue

        counter += 1
        key = ID % counter
        authors = _split_authors(m.group(2))
        title = m.group(3)
        pages = int(m.group(4)), int(m.group(5))
        if counter == 29:
            # Hard-coded override of the end page for entry 29
            # (presumably the TOC lists a wrong value there -- confirm).
            pages = pages[0], 285

        # Entries must be contiguous. Raised explicitly instead of `assert`
        # so the check also runs under `python -O`.
        if pages[0] != last_page + 1:
            raise ValueError(
                'entry %i (%s) starts on page %i, expected page %i'
                % (counter, key, pages[0], last_page + 1)
            )
        last_page = pages[1]

        with open('IWPT1989-anthology/' + key + '.bib', 'w') as bib:
            print(_bibtex_entry(key, authors, title, pages), file=bib)

        pdf_pages = [
            str(OFFSET + correct_page(x))
            for x in range(pages[0], pages[1] + 1)
        ]
        # check_call so a failing pdftk run aborts the script instead of
        # being silently ignored.
        subprocess.check_call(
            ['pdftk', 'IWPT_1989.pdf', 'cat']
            + pdf_pages
            + ['output', 'IWPT1989-anthology/' + key + '.pdf']
        )

# The final entry is expected to end on page 467.
if last_page != 467:
    raise ValueError(
        'last processed page is %i, expected 467' % last_page
    )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment