andreasvc · January 31, 2016 17:37
diff --git a/aclrename.py b/aclrename.py
 """Script to rename papers from ACL Anthology to 'author year title.pdf'

 Given PDF files from the ACL Anthology (http://aclweb.org/anthology/),
 this script downloads each paper's bibtex file and extracts author, year,
 and title to suggest more descriptive file names.

 Before: N04-1016.pdf
 After: Lapata & Keller 2004 The Web as a Baseline: Evaluating the Perform[...]

 Usage:
 $ python3 aclrename.py >/tmp/rename.sh
 $ # do post-editing on /tmp/rename.sh
 $ bash /tmp/rename.sh
 """
 import re
 import sys
 import glob
 import time
 import requests

 # Sample bibtex record in the format this script parses. It is not referenced
 # by the code visible in this file; the backslash-newline inside the literal
 # joins the title onto a single line.
 EXAMPLE = '''@inproceedings{lapata-keller:2004:HLTNAACL,
  author    = {Lapata, Mirella  and  Keller, Frank},
  title     = {The Web as a Baseline: Evaluating the Performance of \
 Unsupervised Web-based Models for a Range of NLP Tasks},
  booktitle = {HLT-NAACL 2004: Main Proceedings },
  editor = {Susan Dumais, Daniel Marcu and Salim Roukos},
  year      = 2004,
  month     = {May 2 - May 7},
  address   = {Boston, Massachusetts, USA},
  publisher = {Association for Computational Linguistics},
  pages     = {121--128}
 }'''

 # Matches file names such as N04-1016.pdf. Group 1 is the full paper ID
 # (N04-1016), group 2 the letter plus two digits (N04), group 3 the letter (N).
 ACLPAPER = re.compile(r'^((([JPNECDQWKHRT])\d{2})-\d{4})\.pdf$')
 # Matches a single "key = value" line. Group 1 is the key, group 2 the text
 # inside an outer {...} pair, group 3 the remaining text when the braced form
 # does not match (it may still include a trailing comma).
 BIBLINE = re.compile(r'^\s*(\S+)\s*=\s*(?:\{(.*)\}|(.*)),?\s*$')
 # http://aclweb.org/anthology/N/N04/N04-1016.bib
 # Filled with (group 3, group 2, group 1) of an ACLPAPER match.
 URLTEMPLATE = 'http://aclweb.org/anthology/%s/%s/%s.bib'
 # Matches strings consisting only of uppercase ASCII letters and non-word
 # characters, i.e. text without any lowercase letter, digit or underscore.
 ALLCAPS = re.compile(r'^[A-Z\W]+$')


 def main():
 	"""Suggest new filenames for all ACL papers in current directory.

 	For every ``*.pdf`` in the working directory whose name matches ACLPAPER,
 	the bibtex record is downloaded and an ``mv`` command is printed to
 	stdout. Progress and error messages go to stderr. No file is renamed by
 	this function itself.
 	"""
 	# Function-scope import so this block does not depend on a file-level one.
 	import shlex
 	for filename in glob.glob('*.pdf'):
 		match = ACLPAPER.match(filename)
 		if match is None:
 			continue
 		# Wait half a second before each request.
 		time.sleep(0.5)
 		url = URLTEMPLATE % (
 				match.group(3), match.group(2), match.group(1))
 		try:
 			# Without a timeout a stalled connection would block forever.
 			bib = requests.get(url, timeout=30)
 		except requests.RequestException as err:
 			# Report and move on, like the non-200 case below.
 			print('could not get bib (%s): %s' % (
 					err, filename), file=sys.stderr)
 			continue
 		if bib.status_code != 200:
 			print('could not get bib (%s): %s' % (
 					bib.status_code, filename), file=sys.stderr)
 			continue
 		author, year, title = parsebib(bib.content.decode('utf8'))
 		newfilename = '%s %s %s.pdf' % (author, year, title)
 		# Path separators cannot appear in a file name.
 		newfilename = newfilename.replace('/', '').replace('\\', '')
 		# shlex.quote keeps quotes, $ and backticks in a title literal when
 		# the generated script is run by the shell.
 		print('mv %s %s' % (
 				shlex.quote(filename), shlex.quote(newfilename)))
 		print('SUCCESS:', filename, file=sys.stderr)


 def parsebib(bib):
 	"""Parse a bibtex string and return (author, year, title).

 	Only lines matched by BIBLINE (single-line ``key = value`` fields) are
 	read; every other line is ignored.

 	:param bib: text of one bibtex entry.
 	:returns: tuple of strings ``(author, year, title)``. ``author`` is the
 		first author's last name, followed by ``' & <second last name>'`` for
 		two authors or ``' et al.'`` for more. ``title`` is cut to 120
 		characters and stripped of braces. An author or title without any
 		lowercase letter, digit or underscore is converted to title case.
 	:raises ValueError: if the author, year or title field is missing or
 		empty; the input and the parsed fields are printed to stderr first.
 	"""
 	data = {}
 	for line in bib.splitlines():
 		bibmatch = BIBLINE.match(line)
 		if bibmatch is None:
 			continue
 		# Test against None rather than truthiness: an empty braced value
 		# ("{}") gives group(2) == '' while group(3) is None.
 		value = bibmatch.group(2)
 		if value is None:
 			value = bibmatch.group(3)
 		data[bibmatch.group(1).lower()] = value.strip('{},')
 	missing = [key for key in ('author', 'year', 'title')
 			if not data.get(key)]
 	if missing:
 		print(bib, file=sys.stderr)
 		print(data, file=sys.stderr)
 		raise ValueError(
 				'missing or empty bibtex field(s): %s' % ', '.join(missing))
 	year = data['year']
 	title = data['title'][:120].replace('{', '').replace('}', '')
 	author = lastname(data['author'])
 	if data['author'].count(' and ') > 1:  # et al
 		author += ' et al.'
 	elif data['author'].count(' and ') == 1:  # A & B
 		author += ' & ' + lastname(data['author'].split(' and ')[1])
 	if ALLCAPS.match(author):
 		author = author.title()
 	if ALLCAPS.match(title):
 		title = title.title()
 	# FIXME: handle accents
 	return author, year, title


 def lastname(name):
 	"""Return the last name of the first author in a bibtex author string.

 	Authors are separated by ``' and '``. The first author may be written as
 	``'Last, First'`` or ``'First Last'``.

 	:param name: bibtex author field, e.g. ``'Lapata, Mirella and Keller, Frank'``.
 	:returns: the first author's last name, or ``''`` if there is none.
 	"""
 	# Isolate the first author before looking for a comma; otherwise a comma
 	# belonging to a later author would be mistaken for the first author's.
 	first = name.split(' and ')[0].strip()
 	if ',' in first:
 		return first[:first.index(',')].strip()
 	parts = first.split()
 	return parts[-1] if parts else ''

 # Run main() only when this file is executed as a script, not when imported.
 if __name__ == '__main__':
 	main()
	"""Script to rename papers from ACL Anthology to 'author year title.pdf'

	Given PDF files from the ACL Anthology (http://aclweb.org/anthology/),
	this script downloads each paper's bibtex file and extracts author, year,
	and title to suggest more descriptive file names.

	Before: N04-1016.pdf
	After: Lapata & Keller 2004 The Web as a Baseline: Evaluating the Perform[...]

	Usage:
	$ python3 aclrename.py >/tmp/rename.sh
	$ # do post-editing on /tmp/rename.sh
	$ bash /tmp/rename.sh
	"""
	import re
	import sys
	import glob
	import time
	import requests

	# Sample bibtex record in the format this script parses. It is not referenced
	# by the code visible in this file.
	EXAMPLE = '''@inproceedings{lapata-keller:2004:HLTNAACL,
	author = {Lapata, Mirella and Keller, Frank},
	title = {The Web as a Baseline: Evaluating the Performance of \
	Unsupervised Web-based Models for a Range of NLP Tasks},
	booktitle = {HLT-NAACL 2004: Main Proceedings },
	editor = {Susan Dumais, Daniel Marcu and Salim Roukos},
	year = 2004,
	month = {May 2 - May 7},
	address = {Boston, Massachusetts, USA},
	publisher = {Association for Computational Linguistics},
	pages = {121--128}
	}'''

	# Matches file names such as N04-1016.pdf. Group 1 is the full paper ID
	# (N04-1016), group 2 the letter plus two digits (N04), group 3 the letter (N).
	ACLPAPER = re.compile(r'^((([JPNECDQWKHRT])\d{2})-\d{4})\.pdf$')
	# Matches a single "key = value" line. Group 1 is the key, group 2 the text
	# inside an outer {...} pair, group 3 the remaining text when the braced form
	# does not match. The "*" quantifiers and the unescaped "|" are required:
	# without them only single-character keys' values would ever match.
	BIBLINE = re.compile(r'^\s*(\S+)\s*=\s*(?:\{(.*)\}|(.*)),?\s*$')
	# http://aclweb.org/anthology/N/N04/N04-1016.bib
	# Filled with (group 3, group 2, group 1) of an ACLPAPER match.
	URLTEMPLATE = 'http://aclweb.org/anthology/%s/%s/%s.bib'
	# Matches strings consisting only of uppercase ASCII letters and non-word
	# characters, i.e. text without any lowercase letter, digit or underscore.
	ALLCAPS = re.compile(r'^[A-Z\W]+$')


	def main():
		"""Suggest new filenames for all ACL papers in current directory.

		For every ``*.pdf`` in the working directory whose name matches
		ACLPAPER, the bibtex record is downloaded and an ``mv`` command is
		printed to stdout. Progress and error messages go to stderr. No file
		is renamed by this function itself.
		"""
		# Function-scope import so this block does not depend on a file-level one.
		import shlex
		for filename in glob.glob('*.pdf'):
			match = ACLPAPER.match(filename)
			if match is None:
				continue
			# Wait half a second before each request.
			time.sleep(0.5)
			url = URLTEMPLATE % (
					match.group(3), match.group(2), match.group(1))
			try:
				# Without a timeout a stalled connection would block forever.
				bib = requests.get(url, timeout=30)
			except requests.RequestException as err:
				# Report and move on, like the non-200 case below.
				print('could not get bib (%s): %s' % (
						err, filename), file=sys.stderr)
				continue
			if bib.status_code != 200:
				print('could not get bib (%s): %s' % (
						bib.status_code, filename), file=sys.stderr)
				continue
			author, year, title = parsebib(bib.content.decode('utf8'))
			newfilename = '%s %s %s.pdf' % (author, year, title)
			# Path separators cannot appear in a file name.
			newfilename = newfilename.replace('/', '').replace('\\', '')
			# shlex.quote keeps quotes, $ and backticks in a title literal
			# when the generated script is run by the shell.
			print('mv %s %s' % (
					shlex.quote(filename), shlex.quote(newfilename)))
			print('SUCCESS:', filename, file=sys.stderr)


	def parsebib(bib):
		"""Parse a bibtex string and return (author, year, title).

		Only lines matched by BIBLINE (single-line ``key = value`` fields) are
		read; every other line is ignored.

		:param bib: text of one bibtex entry.
		:returns: tuple of strings ``(author, year, title)``. ``author`` is
			the first author's last name, followed by
			``' & <second last name>'`` for two authors or ``' et al.'`` for
			more. ``title`` is cut to 120 characters and stripped of braces.
			An author or title without any lowercase letter, digit or
			underscore is converted to title case.
		:raises ValueError: if the author, year or title field is missing or
			empty; the input and the parsed fields are printed to stderr first.
		"""
		data = {}
		for line in bib.splitlines():
			bibmatch = BIBLINE.match(line)
			if bibmatch is None:
				continue
			# Test against None rather than truthiness: an empty braced value
			# ("{}") gives group(2) == '' while group(3) is None.
			value = bibmatch.group(2)
			if value is None:
				value = bibmatch.group(3)
			data[bibmatch.group(1).lower()] = value.strip('{},')
		missing = [key for key in ('author', 'year', 'title')
				if not data.get(key)]
		if missing:
			print(bib, file=sys.stderr)
			print(data, file=sys.stderr)
			raise ValueError(
					'missing or empty bibtex field(s): %s' % ', '.join(missing))
		year = data['year']
		title = data['title'][:120].replace('{', '').replace('}', '')
		author = lastname(data['author'])
		if data['author'].count(' and ') > 1:  # et al
			author += ' et al.'
		elif data['author'].count(' and ') == 1:  # A & B
			author += ' & ' + lastname(data['author'].split(' and ')[1])
		if ALLCAPS.match(author):
			author = author.title()
		if ALLCAPS.match(title):
			title = title.title()
		# FIXME: handle accents
		return author, year, title


	def lastname(name):
		"""Return the last name of the first author in a bibtex author string.

		Authors are separated by ``' and '``. The first author may be written
		as ``'Last, First'`` or ``'First Last'``.

		:param name: bibtex author field, e.g. ``'Lapata, Mirella and Keller, Frank'``.
		:returns: the first author's last name, or ``''`` if there is none.
		"""
		# Isolate the first author before looking for a comma; otherwise a
		# comma belonging to a later author would be mistaken for the first
		# author's.
		first = name.split(' and ')[0].strip()
		if ',' in first:
			return first[:first.index(',')].strip()
		parts = first.split()
		return parts[-1] if parts else ''

	# Run main() only when this file is executed as a script, not when imported.
	if __name__ == '__main__':
		main()
No results found