Created
August 2, 2023 14:46
-
-
Save mkbabb/04372183adce60fed42f03ebdef13763 to your computer and use it in GitHub Desktop.
Unfurls a PDF (expanding its embedded attachments) into multiple PDFs or a single combined PDF
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import argparse | |
| import tempfile | |
| import zipfile | |
| from io import BytesIO | |
| from pathlib import Path | |
| from typing import Optional | |
| import docx2pdf | |
| import pypdf | |
| from pypdf.generic import AnnotationBuilder | |
def append_header(
    name: str,
    page: pypdf.PageObject,
    writer: pypdf.PdfWriter,
) -> pypdf.PageObject:
    """Add a free-text annotation showing ``name`` along the top of ``page``.

    Args:
        name: Text displayed by the annotation.
        page: Page whose media box determines where the annotation goes.
        writer: Writer the annotation is registered with.

    Returns:
        The same ``page`` object that was passed in.
    """
    header_height = 50
    box = page.mediabox

    # ``rect`` is (x_lower_left, y_lower_left, x_upper_right, y_upper_right).
    # The header is a full-width band occupying the top ``header_height``
    # units of the media box, so it stays inside the page.
    header_rect = (
        box.left,
        box.top - header_height,
        box.right,
        box.top,
    )

    annotation = AnnotationBuilder.free_text(
        name,
        rect=header_rect,
        font="Arial",
        bold=True,
        italic=True,
        font_size="20pt",
        font_color="00ff00",
        border_color="0000ff",
    )
    writer.add_annotation(page, annotation)
    return page
def convert_pdf(input_file: Path, in_place: bool) -> Path:
    """Flatten a PDF and its embedded PDF attachments into one PDF file.

    The output contains the pages of ``input_file`` followed by the pages of
    every attachment that can be read as a PDF. When the input has any
    attachments, its first page is left out of the output (presumably it is a
    cover/portfolio sheet -- confirm against real inputs).

    Args:
        input_file: PDF to read.
        in_place: If true, overwrite ``input_file``; otherwise write next to
            it as ``<stem>_normalized<suffix>``.

    Returns:
        Path of the file that was written.
    """
    output_file = (
        input_file
        if in_place
        else input_file.with_stem(f"{input_file.stem}_normalized")
    )

    pdf = pypdf.PdfReader(input_file)
    # Read the attachment mapping once instead of on every loop iteration.
    attachments = pdf.attachments
    has_attachments = len(attachments) > 0

    writer = pypdf.PdfWriter()

    for n, page in enumerate(pdf.pages):
        if n == 0 and has_attachments:
            continue
        writer.add_page(page)

    for name, attachment in attachments.items():
        for payload in attachment:
            # Best effort: an attachment that is not a readable PDF is
            # reported and skipped rather than aborting the whole file.
            try:
                with BytesIO(payload) as buffer:
                    attached_pdf = pypdf.PdfReader(buffer)
                    for page in attached_pdf.pages:
                        writer.add_page(page)
            except Exception as e:
                print(f"Failed to process attachment {name}: {e}")

    with open(output_file, "wb") as out:
        writer.write(out)

    return output_file
def convert_docx(input_file: Path, in_place: bool) -> Path:
    """Convert a ``.docx`` file to a PDF next to it.

    Args:
        input_file: Word document to convert.
        in_place: If true, delete ``input_file`` after the PDF is produced.

    Returns:
        Path of the generated PDF (``input_file`` with a ``.pdf`` suffix).

    Raises:
        FileNotFoundError: If the conversion finished without producing the
            expected PDF.
    """
    output_file = input_file.with_suffix(".pdf")
    docx2pdf.convert(input_file, output_file)

    # Never delete the source unless the converted file actually exists.
    if not output_file.is_file():
        raise FileNotFoundError(f"Conversion produced no output: {output_file}")

    if in_place:
        input_file.unlink()

    return output_file
def convert_file(input_file: Path, in_place: bool) -> Optional[pypdf.PdfWriter]:
    """Convert one file and return a writer holding the resulting pages.

    ``.pdf`` files go through ``convert_pdf`` and ``.docx`` files through
    ``convert_docx``; the suffix is matched case-insensitively so ``.PDF`` and
    ``.DOCX`` are handled too.

    Args:
        input_file: File to convert.
        in_place: Forwarded to the underlying converter.

    Returns:
        A ``PdfWriter`` containing every page of the converted file, or
        ``None`` when the file type is not supported.
    """
    suffix = input_file.suffix.lower()

    if suffix == ".pdf":
        path = convert_pdf(input_file, in_place)
    elif suffix == ".docx":
        path = convert_docx(input_file, in_place)
    else:
        return None

    # Re-read the file written by the converter so the returned writer
    # reflects exactly what is on disk.
    writer = pypdf.PdfWriter()
    for page in pypdf.PdfReader(path).pages:
        writer.add_page(page)

    return writer
def process_directory(
    input_dir: Path, combine: bool, in_place: bool
) -> Optional[pypdf.PdfWriter]:
    """Convert every supported file directly inside ``input_dir``.

    Args:
        input_dir: Directory whose immediate entries are converted.
        combine: If true, collect the pages of all converted files into a
            single writer.
        in_place: Forwarded to ``convert_file``.

    Returns:
        The combined ``PdfWriter`` when ``combine`` is true, otherwise
        ``None``.
    """
    combined_writer = pypdf.PdfWriter() if combine else None

    # Take a sorted snapshot first: conversions create and delete files in
    # this directory, and a live iterator could visit freshly written output.
    # Sorting also makes the page order of the combined PDF deterministic.
    for input_file in sorted(input_dir.glob("*")):
        # Best effort per file: report the failure and move on. ``Exception``
        # (not a bare ``except``) lets Ctrl-C and SystemExit propagate.
        try:
            converted_writer = convert_file(input_file, in_place)

            if combined_writer is not None and converted_writer is not None:
                for page in converted_writer.pages:
                    combined_writer.add_page(page)
        except Exception as e:
            print(f"Failed to convert {input_file}: {e}")

    return combined_writer
def convert_zip(
    input_file: Path, combine: bool, in_place: bool
) -> Optional[pypdf.PdfWriter]:
    """Extract a zip archive to a temporary directory and convert its files.

    The archive is unpacked into a scratch directory, which is handed to
    ``process_directory`` and removed once processing finishes.

    Args:
        input_file: Zip archive to unpack.
        combine: Forwarded to ``process_directory``.
        in_place: Forwarded to ``process_directory``.

    Returns:
        Whatever ``process_directory`` returns for the extracted files.
    """
    with tempfile.TemporaryDirectory() as scratch:
        scratch_dir = Path(scratch)

        # Close the archive before converting anything.
        with zipfile.ZipFile(input_file, "r") as archive:
            archive.extractall(scratch_dir)

        return process_directory(scratch_dir, combine, in_place)
def convert(input_dir: Path, combine: bool, output_file: Path, in_place: bool) -> None:
    """Convert a directory or zip archive and optionally write a combined PDF.

    Args:
        input_dir: Directory to process, or a path ending in ``.zip``.
        combine: If true, write all converted pages to ``output_file``.
        output_file: Destination of the combined PDF; only used when
            ``combine`` is true.
        in_place: Forwarded to the per-file converters.
    """
    # A ``.zip`` suffix selects archive handling; anything else is treated
    # as a directory.
    handler = convert_zip if input_dir.suffix == ".zip" else process_directory
    merged = handler(input_dir, combine, in_place)

    if not combine or merged is None:
        return

    with open(output_file, "wb") as sink:
        merged.write(sink)
if __name__ == "__main__":
    # Command-line entry point.
    #
    # Example:
    #   python <this script> -i ./data/bead_comments.zip -c -o ./combined.pdf
    parser = argparse.ArgumentParser(
        description="Unfurl PDFs (and .docx files) into multiple PDFs or a single combined PDF."
    )
    parser.add_argument(
        "-i",
        "--input_dir",
        type=Path,
        required=True,
        help="directory or .zip archive containing the files to convert",
    )
    parser.add_argument(
        "-c",
        "--combine",
        action="store_true",
        help="write all converted pages into one combined PDF",
    )
    parser.add_argument(
        "-o",
        "--output_file",
        type=Path,
        default=Path("./combined.pdf"),
        required=False,
        help="path of the combined PDF (used with --combine)",
    )
    # ``store_true`` with ``default=True`` could never be switched off.
    # BooleanOptionalAction keeps ``--in_place`` (still the default) and adds
    # ``--no-in_place`` to disable it.
    parser.add_argument(
        "--in_place",
        action=argparse.BooleanOptionalAction,
        default=True,
        help="overwrite PDFs and delete converted .docx sources",
    )

    args = parser.parse_args()

    convert(args.input_dir, args.combine, args.output_file, args.in_place)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment