Last active
December 11, 2015 18:38
-
-
Save chrK/4642764 to your computer and use it in GitHub Desktop.
********* Moved to: https://github.com/chrK/pdf-rename ********* A Simple script to parse dates from pdfs and rename them with a date prefix and a keyword.
Run the script inside the folder with the pdfs you want to rename. ! Attention: This renames and moves your files. Be sure to have backups of the data you're working on.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*- coding: utf-8 -*- | |
import sys, getopt, os, re, time, locale, csv | |
from cStringIO import StringIO | |
try: | |
from pdfminer.pdfinterp import PDFResourceManager, process_pdf | |
from pdfminer.converter import TextConverter | |
from pdfminer.layout import LAParams | |
except ImportError: | |
print "\nError: Could not import PDFMiner. Please '$pip install pdfminer'.\n"\ | |
"\tModule description: http://www.unixuser.org/~euske/python/pdfminer/index.html" | |
def ensure_folder(path, folder_name): | |
# Check for folder. If it doesn't exist create it. | |
folder_path = (os.path.join(path, folder_name)) | |
if not os.path.isdir(folder_path): | |
os.makedirs(folder_path) | |
print "Created result folder '%s'.\n" % folder_name | |
return folder_path | |
def download_raw_gist(gist_id, file_to_save='gist.txt'): | |
# Import the additional modules needed. | |
from urllib2 import Request, urlopen, HTTPError | |
# Create the url and the request. | |
gist_raw_url = 'http://raw.github.com/gist/%s' % gist_id | |
request = Request(gist_raw_url) | |
# Open url and write it to the file. | |
try: | |
print "Trying to download %s" % gist_raw_url | |
keyword_file = urlopen(request) | |
with open(file_to_save, 'w') as local_keyword_file: | |
local_keyword_file.write(keyword_file.read()) | |
print "Successfully downloaded gist #%s.\n" % gist_id | |
except HTTPError, e: | |
print "Error: HTTP", e.code, gist_raw_url | |
def create_dict_from_csv(csv_file, gist_id): | |
# If csv doesn't exist download it from github. | |
if not os.path.exists(csv_file): | |
print "Didn't find %s." % csv_file | |
download_raw_gist(gist_id, csv_file) | |
# Open file and parse the file line per line. | |
keywords = {} | |
try: | |
with open(csv_file, 'rb') as file: | |
keyword_lists = csv.reader(file, delimiter=';') | |
for row in keyword_lists: | |
if len(row) == 1: | |
keywords[row[0].strip(' \t')] = row[0].strip(' \t') | |
else: | |
keywords[row[0].strip(' \t')] = row[1].strip(' \t') | |
except: | |
print "Couldn't create dictionary from csv." | |
return keywords | |
def convert_pdf_to_text(path, maxpages=1): | |
""" | |
Conversion of a pdf content to text. | |
Source: http://stackoverflow.com/questions/5725278/python-help-using-pdfminer-as-a-library | |
""" | |
rsrcmgr = PDFResourceManager() | |
retstr = StringIO() | |
codec = 'utf-8' | |
laparams = LAParams() | |
device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams) | |
with open(path, 'rb') as fp: | |
process_pdf(rsrcmgr, device, fp, maxpages=maxpages) | |
device.close() | |
str = retstr.getvalue() | |
retstr.close() | |
return str | |
def find_date_in_string(text, debug=False): | |
# The dictionary with all necessary data for the different date formats | |
# For each date format we need: | |
# - regular expression | |
# - corresponding python date formats | |
# - locales which should be supported | |
# - a placeholder for the regex match object | |
date_formats = { | |
'alphanumeric' : { | |
'regex' : r'([1-9]|[0-2][\d]|3[01])(\.\ |\.|\-|\/|\ )(Jan|JAN|Feb|FEB|Mar|MAR|Apr|APR|May|MAY|Jun|JUN|Jul|JUL|Aug|AUG|Sep|SEP|Oct|OCT|Nov|NOV|Dec|DEC|Mär|MÄR|Mai|MAI|Okt|OKT|Dez|DEZ|January|February|March|April|June|July|August|September|October|November|December|Januar|Februar|März|Juni|Juli|Oktober|Dezember)[\.\ \-\/]((199[\d]|20[0-2][\d]|2030)|9[\d]|(9[\d]|[12][\d]|30))', | |
'python_date_formats' : [ | |
'%d. %B %Y', '%d. %B %y', | |
'%d %B %Y', '%d %B %y', | |
'%d. %b %Y', '%d. %b %y', | |
'%d %b %Y', '%d %b %y', | |
'%d-%b-%Y', '%d-%b-%y', | |
'%d/%b/%Y', '%d/%b/%y', | |
'%d.%B.%Y', '%d.%B.%y', | |
'%d.%b.%Y', '%d.%b.%y', | |
'%d-%B-%Y', '%d-%B-%y', | |
'%d/%B/%Y', '%d/%B/%y', | |
], | |
'locales' : [ | |
'de_DE', | |
'en_US', | |
], | |
'regex_match' : False, | |
}, | |
'reverse numeric' : { | |
'regex' : r'(199[\d]|20[0-2][\d]|2030)[\.\-\/\ ]([1-9]|[0-2][\d]|3[01])[\.\-\/\ ]([1-9]|[0-2][\d]|3[01])', | |
'python_date_formats' : [ | |
'%Y-%m-%d', | |
'%Y/%m/%d', | |
'%Y.%m.%d', | |
], | |
'regex_match' : False, | |
'locales' : [ 'de_DE', ], | |
}, | |
'numeric' : { | |
'regex' : r'([1-9]|[0-2][\d]|3[01])[\.\-\/\ ]([1-9]|[012][\d]|3[01])[\.\-\/\ ]((199[\d]|20[0-2][\d]|2030)|9[\d]|(9[\d]|[0-2][\d]|30))', | |
'python_date_formats' : [ | |
# 'Normal' dates | |
'%d.%m.%Y', '%d.%m.%y', | |
'%d-%m-%Y', '%d-%m-%y', | |
'%d/%m/%Y', '%d/%m/%y', | |
# US dates | |
'%m.%d.%Y', '%m.%d.%y', | |
'%m-%d-%Y', '%m-%d-%y', | |
'%m/%d/%Y', '%m/%d/%y', | |
], | |
'locales' : [ 'de_DE', ], | |
'regex_match' : False, | |
}, | |
'domaindiscount' : { | |
'regex' : r'\d{2}/\d{2}th/\d{4}', | |
'python_date_formats' : [ | |
'%m/%dth/%Y', | |
], | |
'locales' : [ 'de_DE', ], | |
'regex_match' : False, | |
}, | |
} | |
# Iterate over date formats. | |
parsed_date = '' | |
for date_format_name, date_format_data in date_formats.iteritems(): | |
found_date = False | |
# Iterate over locales to have i18n date matching. | |
# (String formatting directives '%b' & '%B' are locale dependant.) | |
for locale_value in date_format_data['locales']: | |
locale.setlocale(locale.LC_ALL, locale_value) | |
date_format_data['regex_match'] = re.search(date_format_data['regex'], text) | |
if date_format_data['regex_match']: | |
if debug: print "\tRegex match: %s" % date_format_data['regex_match'].group(0) | |
for date_format in date_format_data['python_date_formats']: | |
try: | |
parsed_date = time.strptime(date_format_data['regex_match'].group(0), date_format) | |
if debug: print "\tFound %s date with the format of '%s'." % (date_format_name, date_format) | |
return parsed_date | |
except: | |
pass | |
else: | |
if debug: print "\tNotice: Could not retrieve a date." | |
return | |
def find_first_keyword_in_string(text, keywords, debug=False): | |
keyword_string = '' | |
try: | |
# Convert pdf_content to list. | |
text_as_list = text.split() | |
# Check for keywords: | |
for word in text_as_list: | |
for keyword, replacement in keywords.iteritems(): | |
if keyword in word: | |
keyword_string += "%s_" % replacement | |
if debug: print "\tFound the keyword '%s'." % keyword | |
return keyword_string | |
else: | |
if debug: print "\tDidn't find a keyword." | |
except: | |
if debug: print "\tError: Could not process for keywords." | |
pass | |
return keyword_string | |
def prefix_filename(filename, prefix): | |
new_filename = prefix + filename | |
os.rename(filename, new_filename) | |
return new_filename | |
def main(argv): | |
script_name = argv[0].split(os.sep)[-1] | |
def usage(): | |
print ('Usage: %s [-v --verbose] [-d --debug] [-g --gist <gist-id>][-p --path <path-name>] ' | |
'[-f --date-format <python-date-format-string>] [-r --result-folder <result-folder>] ' | |
'[-a --all-pages] [-h --help]' % script_name) | |
try: | |
opts, arguments = getopt.getopt(argv[1:], "hvdg:f:r:p:ak:", ['help', 'verbose', 'debug', 'gist-id=', 'date-format=', 'result-folder=', 'path=', 'all-pages', 'keywords-file=']) | |
except getopt.GetoptError as err: | |
# Print help information and exit. | |
print "Error: %s.\nTry '%s --help' for more information." % (str(err), script_name) | |
sys.exit() | |
verbose = False | |
debug = False | |
pages_of_pdf_to_parse = 1 | |
prefix_date_format = '%Y-%m-%d' | |
result_folder_name = 'processed' | |
working_directory = os.getcwd() | |
keywords_file = 'pdf-keywords.txt' | |
gist_id = 4660290 | |
for opt, arg in opts: | |
if opt in ('-v', '--verbose'): | |
verbose = True | |
elif opt in ('-d', '--debug'): | |
debug = True | |
elif opt in ('-g', '--gist-id'): | |
gist_id = arg | |
elif opt in ('-f', '--date-format'): | |
prefix_date_format = arg | |
elif opt in ('-r', '--result-folder'): | |
result_folder_name = arg | |
elif opt in ('-p', '--path'): | |
os.chdir(arg) | |
working_directory = os.getcwd() | |
elif opt in ('-a', '--all-pages'): | |
pages_of_pdf_to_parse = 0 | |
elif opt in ('-h', '--help'): | |
usage() | |
return | |
elif opt in ('-k', '--keywords-file'): | |
keywords_file = arg | |
else: | |
assert False, "unhandled option" | |
# Get list of files and check if there are any files to work on. | |
pdfs_to_process = [file for file in os.listdir(working_directory) if file.endswith('.pdf') or file.endswith('.PDF')] | |
if len(pdfs_to_process) > 0: | |
print '\nFound %s pdf-files to process.\n' % str(len(pdfs_to_process)) | |
if prefix_date_format != '%Y-%m-%d': | |
print "Will use custom date format '%s'" % prefix_date_format | |
else: | |
print "\nDidn't find any pdf-files in the given folder.\n" | |
return | |
# Check if result folder exists. | |
result_folder = ensure_folder(working_directory, result_folder_name) | |
# Define keywords: | |
keywords = create_dict_from_csv(keywords_file, gist_id) | |
# Iterate over these files, try to parse the dates and build | |
# a dictionary with all data we need to rename them. | |
pdfs_to_rename = {} | |
for pdf in pdfs_to_process: | |
if verbose or debug: print "Processing '%s'" % pdf | |
# Reset loop variables. | |
pdf_content, pdf_date, pdf_keyword = '', '', '' | |
try: | |
pdf_content = convert_pdf_to_text(pdf, pages_of_pdf_to_parse) | |
if pdf_content: | |
pdf_date = time.strftime(prefix_date_format, find_date_in_string(pdf_content, debug)) # TODO: A Fail of find_date_in_sting() should be escaped separated. | |
pdf_keyword = find_first_keyword_in_string(pdf_content, keywords, debug) | |
pdfs_to_rename[pdf] = "%s_%s" % (pdf_date, pdf_keyword) | |
except: | |
print "Notice: Could not parse '%s'" % pdf | |
# Rename & move the pdfs. | |
for pdf, prefix in pdfs_to_rename.iteritems(): | |
new_pdf = prefix_filename(pdf, prefix) | |
os.rename(new_pdf, os.path.join(result_folder, new_pdf)) | |
# Report how many PDFs could be matched to a date. | |
print "\nI could extract a date from %s of %s PDFs.\n" % (str(len(pdfs_to_rename)), str(len(pdfs_to_process))) | |
if __name__ == '__main__': sys.exit(main(sys.argv)) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
sweet :-)