Created
January 31, 2016 17:37
-
-
Save andreasvc/f539b479d9af744d0541 to your computer and use it in GitHub Desktop.
Script to rename papers from ACL Anthology to 'author year title.pdf'
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Script to rename papers from ACL Anthology to 'author year title.pdf' | |
Given PDF files from the ACL anthology http://aclweb.org/anthology/ | |
downloads bibtex file and extracts author, year, title | |
to suggest more descriptive names. | |
Before: N04-1016.pdf | |
After: Lapata & Keller 2004 The Web as a Baseline: Evaluating the Perform[...] | |
Usage: | |
$ python3 aclrename.py >/tmp/rename.sh | |
$ # do post-editing on /tmp/rename.sh | |
$ bash /tmp/rename.sh | |
""" | |
import glob
import re
import shlex
import sys
import time

import requests
# Sample bibtex record for N04-1016, the paper used as the example in the
# module docstring. It is not referenced by the code visible in this file.
# The backslash at the end of the title line is a Python line continuation
# inside the literal, so the title ends up on a single line of the string.
EXAMPLE = '''@inproceedings{lapata-keller:2004:HLTNAACL,
  author = {Lapata, Mirella and Keller, Frank},
  title = {The Web as a Baseline: Evaluating the Performance of \
Unsupervised Web-based Models for a Range of NLP Tasks},
  booktitle = {HLT-NAACL 2004: Main Proceedings },
  editor = {Susan Dumais, Daniel Marcu and Salim Roukos},
  year = 2004,
  month = {May 2 - May 7},
  address = {Boston, Massachusetts, USA},
  publisher = {Association for Computational Linguistics},
  pages = {121--128}
}'''
# Filename of an ACL Anthology paper, e.g. 'N04-1016.pdf'.
# group(1): paper ID ('N04-1016'), group(2): volume prefix ('N04'),
# group(3): the single leading letter ('N').
ACLPAPER = re.compile(r'^((([JPNECDQWKHRT])\d{2})-\d{4})\.pdf$')

# One 'key = value' line of a bibtex entry.
# group(1): field name, group(2): value written inside {...} (braces removed),
# group(3): value written without braces; because '.*' is greedy this may
# still end in a trailing comma.
BIBLINE = re.compile(r'^\s*(\S+)\s*=\s*(?:\{(.*)\}|(.*)),?\s*$')

# http://aclweb.org/anthology/N/N04/N04-1016.bib
# Filled with (letter, volume prefix, paper ID), i.e. ACLPAPER groups 3, 2, 1.
URLTEMPLATE = 'http://aclweb.org/anthology/%s/%s/%s.bib'

# Matches strings made up only of uppercase ASCII letters and non-word
# characters (spaces, punctuation), i.e. text with no lowercase letters/digits.
ALLCAPS = re.compile(r'^[A-Z\W]+$')
def main():
    """Suggest new filenames for all ACL papers in the current directory.

    For every ``*.pdf`` in the working directory whose name matches ACLPAPER,
    download the corresponding ``.bib`` file, extract author, year and title
    with parsebib(), and print one ``mv`` command per paper on stdout.
    Progress and error messages go to stderr so that stdout can be redirected
    into a shell script. Papers whose bib file cannot be fetched, decoded or
    parsed are reported and skipped; the remaining files are still processed.
    """
    for filename in glob.glob('*.pdf'):
        match = ACLPAPER.match(filename)
        if match is None:
            continue
        # Pause before every request (presumably to go easy on the server).
        time.sleep(0.5)
        url = URLTEMPLATE % (match.group(3), match.group(2), match.group(1))
        try:
            # Without a timeout a stalled connection would hang the script.
            bib = requests.get(url, timeout=30)
        except requests.RequestException as err:
            print('could not get bib (%s): %s' % (err, filename),
                    file=sys.stderr)
            continue
        if bib.status_code != 200:
            print('could not get bib (%s): %s' % (
                    bib.status_code, filename), file=sys.stderr)
            continue
        try:
            author, year, title = parsebib(bib.content.decode('utf8'))
        except ValueError as err:
            # Covers both parsebib() failures and UnicodeDecodeError, which
            # is a subclass of ValueError; one bad file must not stop the run.
            print('could not parse bib (%s): %s' % (
                    type(err).__name__, filename), file=sys.stderr)
            continue
        newfilename = '%s %s %s.pdf' % (author, year, title)
        # Path separators cannot be part of a filename.
        newfilename = newfilename.replace('/', '').replace('\\', '')
        # Titles may contain quotes, '$' or backticks; shlex.quote keeps the
        # generated script safe to run through bash.
        print('mv %s %s' % (shlex.quote(filename), shlex.quote(newfilename)))
        print('SUCCESS:', filename, file=sys.stderr)
def parsebib(bib):
    """Parse a bibtex string and return (author, year, title).

    Only lines of the form ``key = value`` (see BIBLINE) are considered; all
    other lines are ignored. Field names are lower-cased.

    :param bib: the text of a single bibtex entry.
    :returns: a tuple ``(author, year, title)`` of strings, where author is
        'A' for one author, 'A & B' for two, and 'A et al.' for more;
        title is cut to its first 120 characters and stripped of braces.
        Author and title are title-cased when they contain no lowercase
        letters or digits.
    :raises ValueError: if the entry has no author, year, or title field;
        the raw entry and the parsed fields are printed to stderr first.
    """
    data = {}
    for line in bib.splitlines():
        bibmatch = BIBLINE.match(line)
        if bibmatch is None:
            continue
        braced, bare = bibmatch.group(2), bibmatch.group(3)
        # Test against None rather than truthiness: an empty braced value
        # such as 'note = {},' gives group(2) == '' and group(3) is None.
        value = braced if braced is not None else bare
        data[bibmatch.group(1).lower()] = value.strip('{},')
    missing = [key for key in ('author', 'year', 'title') if key not in data]
    if missing:
        print(bib, file=sys.stderr)
        print(data, file=sys.stderr)
        raise ValueError(
                'bibtex entry lacks field(s): %s' % ', '.join(missing))
    year = data['year']
    title = data['title'][:120].replace('{', '').replace('}', '')
    authors = data['author']
    author = lastname(authors)
    separators = authors.count(' and ')
    if separators > 1:  # three or more authors: et al
        author += ' et al.'
    elif separators == 1:  # exactly two authors: A & B
        author += ' & ' + lastname(authors.split(' and ')[1])
    if ALLCAPS.match(author):
        author = author.title()
    if ALLCAPS.match(title):
        title = title.title()
    # FIXME: handle accents
    return author, year, title
def lastname(name):
    """Return the last name of the first author in a bibtex author string.

    :param name: one author, or several authors joined by ' and '; each
        author is written either as 'Last, First' or as 'First Last'.
    :returns: the last name of the first author, or '' if name is empty
        or contains only whitespace.
    """
    # Isolate the first author before looking for a comma; otherwise a comma
    # belonging to a later author ('A B and C, D') would be picked up.
    first = name.split(' and ')[0].strip()
    if ',' in first:
        return first.partition(',')[0].strip()
    words = first.split()
    return words[-1] if words else ''
# Run only when executed as a script, not when imported as a module.
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment