Last active
August 18, 2022 16:22
-
-
Save sgrontflix/e31a32721533fb07e821aba9440d813a to your computer and use it in GitHub Desktop.
Simple script that allows you to merge PDF files. You can also specify which pages to consider when merging.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import argparse | |
import re | |
from pathlib import Path | |
from PyPDF2 import PdfMerger | |
from PyPDF2.errors import PyPdfError | |
def parse_pages(pages_list, files):
    """Convert page-selection strings into page selections for the merger.

    Each entry of ``pages_list`` is paired positionally with an entry of
    ``files``. An entry is either the bare word ``all`` or a parenthesised,
    space-separated list of items, where an item is one of:

    * ``x``   - a single page index, stored as ``(x, x + 1)``
    * ``x,y`` - a range, stored as ``(x, y)``; ``x`` must be less than ``y``
    * ``all`` - the whole document, stored as ``None``

    Indices may be negative, e.g. ``(-1)`` becomes ``(-1, 0)``.

    Args:
        pages_list: List of selection strings. A falsy value (``None`` or an
            empty list) selects every page of every file.
        files: File names the selections refer to; also used in error
            messages.

    Returns:
        A list holding one list per (file, selection) pair, e.g.
        ``[[(0, 1), (2, 4)], [None]]``. Returns ``None`` after printing a
        message when a selection is malformed or a range has start >= stop.
    """
    # No ranges => select all pages ("None" means "all pages" in PyPDF2).
    # Build an independent inner list per file so that mutating one entry
    # can never affect the others.
    if not pages_list:
        return [[None] for _ in files]

    # One item is a page index, a "start,stop" pair, or the word "all".
    item = r'(?:-?\d+(?:,-?\d+)?|all)'
    # The whole string must match: anything before "(" or after ")" is a
    # syntax error rather than being silently glued onto a page number.
    syntax = re.compile(rf'\({item}(?: {item})*\)|all')

    parsed_pages = []
    for file, pages in zip(files, pages_list):
        if not syntax.fullmatch(pages):
            print(f'Syntax error: \"{pages}\".')
            return None

        # Parsed selections for the current file, e.g. [(0, 1), (2, 4)].
        file_pages = []
        # After validation the parentheses can only sit at the two ends.
        for p in pages.strip('()').split(' '):
            if p == 'all':
                file_pages.append(None)
            elif ',' in p:
                start, stop = (int(n) for n in p.split(','))
                if start >= stop:
                    print(f'Start page is greater than or equal to final page in \"{p}\" for file \"{file}\".')
                    return None
                file_pages.append((start, stop))
            else:
                page = int(p)
                file_pages.append((page, page + 1))
        parsed_pages.append(file_pages)

    return parsed_pages
def parse_arguments():
    """Parse the command line and normalise its values for ``main``.

    Reads the input files, the optional per-file page selections and the
    output file name. Surplus selections are dropped with a warning; files
    without a selection are taken in full. ``.pdf`` is appended to the
    output name when it does not already end with it.

    Returns:
        A ``(files, pages_list, outfile)`` tuple. ``pages_list`` is the
        result of ``parse_pages`` (``None`` when a selection is invalid).
        ``outfile`` is ``None`` when it refers to one of the input files.
    """
    parser = argparse.ArgumentParser(
        description='Simple PDF merger',
        usage='pdf_merge.py [-h] [-p [PAGES_LIST ...]] '
              '[-o OUTFILE] file1 [other_files ...]',
        # Keep the hand-placed line breaks in the help texts below.
        formatter_class=argparse.RawTextHelpFormatter,
    )
    parser.add_argument('files', nargs='+', help='List of files to merge\n'
                                                 'You can specify a single file '
                                                 'if you only want to extract certain pages')
    parser.add_argument('-p', '--pages-list', nargs='*', help='Pages to consider when merging:\n'
                                                              '\"(0,2)\" => first two pages\n'
                                                              '\"(0 2,5)\" => first page + pages from third to fifth\n'
                                                              '\"(-1)\" => last page\n'
                                                              '\"all\" => the whole document')
    parser.add_argument('-o', '--outfile', default='merged.pdf', help='Name of output file')

    args = parser.parse_args()
    files, pages_list, outfile = args.files, args.pages_list, args.outfile

    # Positive: more selections than files; negative: fewer; zero: equal
    # counts or no selections given at all.
    surplus = len(pages_list) - len(files) if pages_list else 0
    if surplus > 0:
        print(f'Number of ranges is greater than number of files. '
              f'Last {surplus} range(s) will be ignored.')
        pages_list = pages_list[:len(files)]
    elif surplus < 0:
        print(f'Number of ranges is less than number of files. '
              f'Last {-surplus} file(s) will be treated in full.')

    pages_list = parse_pages(pages_list, files)
    # Pad with "all pages" for files that had no selection, unless parsing
    # failed and there is nothing to pad.
    if pages_list and surplus < 0:
        pages_list.extend([None] for _ in range(-surplus))

    if not outfile.endswith('.pdf'):
        outfile = outfile + '.pdf'

    # Compare resolved paths rather than raw strings so that different
    # spellings of the same file (e.g. "./a.pdf" vs "a.pdf") are caught
    # before the output can overwrite an input.
    out_path = Path(outfile).resolve()
    if any(Path(f).resolve() == out_path for f in files):
        print(f'Output file name ({outfile}) is the same as one of the specified files.')
        outfile = None

    return files, pages_list, outfile
def main():
    """Merge the requested pages of the input files into the output file.

    Takes its inputs from ``parse_arguments``. Returns silently when the
    page selections or the output name were rejected there. Any other
    failure prints a message and stops without writing the output.
    """
    files, pages_list, outfile = parse_arguments()
    if not (pages_list and outfile):
        return

    with PdfMerger() as merger:
        for source, selections in zip(files, pages_list):
            if not (source and Path(source).is_file()):
                print(f'Invalid path: \"{source}\".')
                return

            # One append per selection; a selection of None means all pages.
            for selection in selections:
                try:
                    merger.append(source, pages=selection)
                except PyPdfError:
                    print(f'Invalid file: \"{source}\".')
                    return
                except IndexError:
                    print(f'Page range \"{selection}\" out of bounds for file \"{source}\".')
                    return

        # Write while the merger is still open.
        try:
            merger.write(outfile)
            print(f'Files successfully merged into \"{outfile}\".')
        except PyPdfError:
            print('Couldn\'t merge files.')
        except PermissionError:
            print(f'Couldn\'t write to output file: permission denied.')
        except OSError:
            print(f'Invalid output file name: \"{outfile}\".')
# Run the merger only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment