Skip to content

Instantly share code, notes, and snippets.

@mkbabb
Created August 2, 2023 14:46
Show Gist options
  • Select an option

  • Save mkbabb/04372183adce60fed42f03ebdef13763 to your computer and use it in GitHub Desktop.

Select an option

Save mkbabb/04372183adce60fed42f03ebdef13763 to your computer and use it in GitHub Desktop.
Unfurls PDFs (expanding their embedded PDF attachments into pages) into multiple PDFs or a single combined PDF
import argparse
import tempfile
import zipfile
from io import BytesIO
from pathlib import Path
from typing import Optional
import docx2pdf
import pypdf
from pypdf.generic import AnnotationBuilder
def append_header(
    name: str,
    page: pypdf.PageObject,
    writer: pypdf.PdfWriter,
):
    """Add a free-text annotation containing ``name`` to ``page``.

    The annotation is built with ``AnnotationBuilder.free_text`` and
    registered through ``writer.add_annotation``.

    Args:
        name: Text shown in the annotation.
        page: Page whose media box determines the annotation rectangle.
        writer: Writer the annotation is registered with.

    Returns:
        The same ``page`` object that was passed in.
    """
    box = page.mediabox
    # NOTE(review): the corners are given as (right, top, left, bottom - 50),
    # i.e. a rectangle spanning the whole media box and extending 50 units
    # below it -- confirm whether a strip along the top edge was intended.
    rect = (box.right, box.top, box.left, box.bottom - 50)

    header = AnnotationBuilder.free_text(
        name,
        rect=rect,
        font="Arial",
        bold=True,
        italic=True,
        font_size="20pt",
        font_color="00ff00",
        border_color="0000ff",
    )
    writer.add_annotation(page, header)

    return page
def convert_pdf(input_file: Path, in_place: bool) -> Path:
    """Flatten a PDF and its embedded attachments into one PDF file.

    The pages of ``input_file`` are copied into a new writer, followed by the
    pages of every attachment that can be parsed as a PDF. When the document
    has at least one attachment, its first page is left out of the output
    (presumably a cover page for the attachment bundle -- confirm).

    Args:
        input_file: Path of the PDF to read.
        in_place: When true, the result overwrites ``input_file``; otherwise
            it is written next to it with ``_normalized`` appended to the stem.

    Returns:
        The path of the PDF file that was written.
    """
    output_file = (
        input_file
        if in_place
        else input_file.with_stem(f"{input_file.stem}_normalized")
    )

    pdf = pypdf.PdfReader(input_file)
    writer = pypdf.PdfWriter()

    attachments = pdf.attachments
    has_attachments = len(attachments) > 0

    for n, page in enumerate(pdf.pages):
        # Drop the first page only for documents that carry attachments.
        if n == 0 and has_attachments:
            continue
        writer.add_page(page)

    for name, attachment in attachments.items():
        # Each attachment name maps to an iterable of byte payloads.
        for b in attachment:
            try:
                with BytesIO(b) as f:
                    t_pdf = pypdf.PdfReader(f)
                    for page in t_pdf.pages:
                        writer.add_page(page)
            except Exception as e:
                # Best effort: a payload that cannot be read as a PDF is
                # reported and skipped so the remaining ones still get merged.
                print(f"Failed to process attachment {name}: {e}")

    with open(output_file, "wb") as f:
        writer.write(f)

    return output_file
def convert_docx(input_file: Path, in_place: bool) -> Path:
    """Convert a ``.docx`` file to a PDF stored alongside it.

    The output path is ``input_file`` with its suffix replaced by ``.pdf``;
    the conversion itself is delegated to ``docx2pdf.convert``.

    Args:
        input_file: Path of the Word document.
        in_place: When true, the source document is deleted after conversion.

    Returns:
        The path of the PDF file.
    """
    output_file = input_file.with_suffix(".pdf")

    docx2pdf.convert(input_file, output_file)

    # Only discard the source once the PDF is actually on disk, so a
    # conversion that produced nothing does not cost the original document.
    if in_place and output_file.exists():
        input_file.unlink()

    return output_file
def convert_file(input_file: Path, in_place: bool) -> Optional[pypdf.PdfWriter]:
    """Convert one file and load the result into a fresh ``PdfWriter``.

    Files ending in ``.pdf`` go through ``convert_pdf`` and files ending in
    ``.docx`` go through ``convert_docx``; the suffix comparison is
    case-sensitive. Any other file is ignored.

    Args:
        input_file: File to convert.
        in_place: Forwarded unchanged to the selected converter.

    Returns:
        A writer holding every page of the converted PDF, or ``None`` when
        the suffix is not handled.
    """
    converters = {
        ".pdf": convert_pdf,
        ".docx": convert_docx,
    }

    converter = converters.get(input_file.suffix)
    if converter is None:
        return None

    converted_path = converter(input_file, in_place)

    # Re-read the converted file from disk and copy its pages over.
    result = pypdf.PdfWriter()
    for converted_page in pypdf.PdfReader(converted_path).pages:
        result.add_page(converted_page)

    return result
def process_directory(
    input_dir: Path, combine: bool, in_place: bool
) -> Optional[pypdf.PdfWriter]:
    """Convert every top-level entry of ``input_dir``, optionally merging them.

    Each entry is passed to ``convert_file``. A failure on one entry is
    reported on stdout and does not stop the remaining entries.

    Args:
        input_dir: Directory whose direct children are processed.
        combine: When true, pages of every converted file are appended to a
            single writer, in sorted path order.
        in_place: Forwarded unchanged to ``convert_file``.

    Returns:
        The combined writer when ``combine`` is true, otherwise ``None``.
    """
    combined_writer = pypdf.PdfWriter() if combine else None

    # Take a sorted snapshot of the listing up front: conversion creates new
    # files in this same directory, which must not be picked up mid-iteration,
    # and sorting makes the page order of the combined output deterministic.
    for input_file in sorted(input_dir.glob("*")):
        try:
            converted_writer = convert_file(input_file, in_place)

            if combined_writer is not None and converted_writer is not None:
                for page in converted_writer.pages:
                    combined_writer.add_page(page)
        except Exception as e:
            # Best effort per file, but let KeyboardInterrupt/SystemExit
            # propagate and report what actually went wrong.
            print(f"Failed to convert {input_file}: {e}")

    return combined_writer
def convert_zip(
    input_file: Path, combine: bool, in_place: bool
) -> Optional[pypdf.PdfWriter]:
    """Extract a zip archive to a temporary directory and process it.

    The archive is unpacked into a ``tempfile.TemporaryDirectory`` which is
    then handed to ``process_directory``. The temporary directory, including
    any files the conversion wrote into it, is removed before this function
    returns.

    Args:
        input_file: Path of the zip archive.
        combine: Forwarded unchanged to ``process_directory``.
        in_place: Forwarded unchanged to ``process_directory``.

    Returns:
        Whatever ``process_directory`` returns for the extracted contents.
    """
    with tempfile.TemporaryDirectory() as scratch:
        extract_root = Path(scratch)

        with zipfile.ZipFile(input_file, "r") as archive:
            archive.extractall(extract_root)

        return process_directory(extract_root, combine, in_place)
def convert(input_dir: Path, combine: bool, output_file: Path, in_place: bool) -> None:
    """Convert a directory or zip archive and optionally write a merged PDF.

    Args:
        input_dir: Directory to process, or a path ending in ``.zip`` which is
            routed through ``convert_zip`` instead.
        combine: When true, the merged result is written to ``output_file``.
        output_file: Destination of the merged PDF; only used when combining.
        in_place: Forwarded unchanged to the conversion routines.
    """
    if input_dir.suffix == ".zip":
        combined_writer = convert_zip(input_dir, combine, in_place)
    else:
        combined_writer = process_directory(input_dir, combine, in_place)

    # Nothing to write unless merging was requested and a writer came back.
    if not combine or combined_writer is None:
        return

    with open(output_file, "wb") as out:
        combined_writer.write(out)
def build_parser() -> argparse.ArgumentParser:
    """Build the command-line parser for this script.

    Options:
        -i / --input_dir: Directory or ``.zip`` archive to process (required).
        -c / --combine: Merge all converted files into one PDF.
        -o / --output_file: Path of the merged PDF (default ``./combined.pdf``).
        --in_place / --no-in_place: Whether converted files replace their
            sources (default: in place).

    Returns:
        The configured ``ArgumentParser``.
    """
    parser = argparse.ArgumentParser()

    parser.add_argument("-i", "--input_dir", type=Path, required=True)
    parser.add_argument("-c", "--combine", action="store_true")
    parser.add_argument(
        "-o",
        "--output_file",
        type=Path,
        default=Path("./combined.pdf"),
        required=False,
    )
    # A plain ``store_true`` flag whose default is already True can never be
    # switched off. BooleanOptionalAction keeps ``--in_place`` working and
    # adds ``--no-in_place`` as the way to disable it.
    parser.add_argument(
        "--in_place", action=argparse.BooleanOptionalAction, default=True
    )

    return parser


if __name__ == "__main__":
    args = build_parser().parse_args()

    convert(args.input_dir, args.combine, args.output_file, args.in_place)
# input_dir = Path("./data/bead_comments.zip")
# combine = True
# output_file = Path("./combined.pdf")
# in_place = True
# convert(input_dir, combine, output_file, in_place)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment