mkbabb · August 2, 2023 14:46
diff --git a/unfurl_pdf.py b/unfurl_pdf.py
 import argparse
 import tempfile
 import zipfile
 from io import BytesIO
 from pathlib import Path
 from typing import Optional

 import docx2pdf
 import pypdf
 from pypdf.generic import AnnotationBuilder


 def append_header(
    name: str,
    page: pypdf.PageObject,
    writer: pypdf.PdfWriter,
    band_height: float = 50,
 ) -> pypdf.PageObject:
    """Add ``name`` as a free-text annotation in a band along the top of ``page``.

    Args:
        name: Text shown in the annotation.
        page: Page whose media box positions the annotation.
        writer: Writer the annotation is registered with; ``page`` is handed
            to ``writer.add_annotation`` unchanged.
        band_height: Height of the annotation box, in the same units as the
            page's media box, measured down from the top edge.

    Returns:
        The same ``page`` object that was passed in.
    """
    box = page.mediabox
    # pypdf documents rect as (x_lower_left, y_lower_left, x_upper_right,
    # y_upper_right); the band hugs the top edge of the page.
    header_rect = (box.left, box.top - band_height, box.right, box.top)

    annotation = AnnotationBuilder.free_text(
        name,
        rect=header_rect,
        font="Arial",
        bold=True,
        italic=True,
        font_size="20pt",
        font_color="00ff00",
        border_color="0000ff",
    )
    writer.add_annotation(page, annotation)

    return page


 def convert_pdf(input_file: Path, in_place: bool) -> Path:
    """Flatten a PDF and its embedded PDF attachments into one document.

    The pages of ``input_file`` are copied first, followed by the pages of
    every attachment that pypdf manages to open. When the file carries
    attachments its first page is dropped (presumably a cover sheet that only
    lists them -- confirm against the source documents).

    Args:
        input_file: PDF to read.
        in_place: Overwrite ``input_file`` when true; otherwise write next to
            it with ``_normalized`` appended to the stem.

    Returns:
        Path of the file that was written.
    """
    output_file = (
        input_file
        if in_place
        else input_file.with_stem(f"{input_file.stem}_normalized")
    )

    pdf = pypdf.PdfReader(input_file)
    writer = pypdf.PdfWriter()

    # Look the attachments up once; the same mapping drives both the
    # first-page check and the merge loop below.
    attachments = pdf.attachments
    skip_first_page = len(attachments) > 0

    for index, page in enumerate(pdf.pages):
        if skip_first_page and index == 0:
            continue
        writer.add_page(page)

    for name, blobs in attachments.items():
        for blob in blobs:
            try:
                with BytesIO(blob) as stream:
                    for page in pypdf.PdfReader(stream).pages:
                        writer.add_page(page)
            except Exception as e:
                # Best effort: an attachment that cannot be read as a PDF is
                # reported and skipped rather than aborting the whole file.
                print(f"Failed to process attachment {name}: {e}")

    with open(output_file, "wb") as f:
        writer.write(f)

    return output_file


 def convert_docx(input_file: Path, in_place: bool) -> Path:
    """Convert a Word document to a PDF placed alongside it.

    Args:
        input_file: Document to convert.
        in_place: When true, delete ``input_file`` once its PDF exists.

    Returns:
        Path of the PDF, i.e. ``input_file`` with its suffix replaced by
        ``.pdf``.
    """
    output_file = input_file.with_suffix(".pdf")
    docx2pdf.convert(input_file, output_file)

    # Only discard the source after the PDF is confirmed on disk, so a
    # conversion that produced nothing cannot cost the caller the original.
    if in_place and output_file.exists():
        input_file.unlink()

    return output_file


 def convert_file(input_file: Path, in_place: bool) -> Optional[pypdf.PdfWriter]:
    """Convert one file and load the resulting PDF into a fresh writer.

    Args:
        input_file: Candidate file; only ``.pdf`` and ``.docx`` suffixes
            (matched case-insensitively) are handled.
        in_place: Forwarded to the format-specific converter.

    Returns:
        A writer holding the pages of the converted PDF, or ``None`` when the
        suffix is not supported.
    """
    # Lower-case the suffix so files named e.g. "REPORT.PDF" are not skipped.
    suffix = input_file.suffix.lower()

    if suffix == ".pdf":
        path = convert_pdf(input_file, in_place)
    elif suffix == ".docx":
        path = convert_docx(input_file, in_place)
    else:
        return None

    writer = pypdf.PdfWriter()
    for page in pypdf.PdfReader(path).pages:
        writer.add_page(page)

    return writer


 def process_directory(
    input_dir: Path, combine: bool, in_place: bool
 ) -> Optional[pypdf.PdfWriter]:
    """Convert every direct child of ``input_dir``, optionally merging the results.

    Entries are visited in sorted order so the page order of the merged
    document is reproducible. Subdirectories are not descended into. An entry
    that fails to convert is reported on stdout and skipped.

    Args:
        input_dir: Directory whose entries are converted.
        combine: When true, collect the pages of every converted file into a
            single writer.
        in_place: Forwarded to ``convert_file``.

    Returns:
        The merged writer when ``combine`` is true, otherwise ``None``.
    """
    combined_writer = pypdf.PdfWriter() if combine else None

    for input_file in sorted(input_dir.glob("*")):
        try:
            converted_writer = convert_file(input_file, in_place)

            if combined_writer is not None and converted_writer is not None:
                for page in converted_writer.pages:
                    combined_writer.add_page(page)
        except Exception as e:
            # Best effort per file, but let KeyboardInterrupt/SystemExit
            # propagate and keep the reason in the report.
            print(f"Failed to convert {input_file}: {e}")

    return combined_writer


 def convert_zip(
    input_file: Path, combine: bool, in_place: bool
 ) -> Optional[pypdf.PdfWriter]:
    """Unpack a zip archive into a scratch directory and convert its contents.

    The scratch directory is removed when the function returns, so files
    written into it during conversion do not outlive the call; the returned
    writer is the only result handed back.

    Args:
        input_file: Zip archive to extract.
        combine: Forwarded to ``process_directory``.
        in_place: Forwarded to ``process_directory``.

    Returns:
        Whatever ``process_directory`` returns for the extracted files.
    """
    with tempfile.TemporaryDirectory() as scratch:
        extract_root = Path(scratch)

        with zipfile.ZipFile(input_file, "r") as archive:
            archive.extractall(extract_root)

        return process_directory(extract_root, combine, in_place)


 def convert(input_dir: Path, combine: bool, output_file: Path, in_place: bool) -> None:
    """Convert a directory or zip archive and optionally write a merged PDF.

    Args:
        input_dir: Directory to process, or a path ending in ``.zip`` to be
            extracted and processed.
        combine: When true, write all converted pages to ``output_file``.
        output_file: Destination of the merged PDF; only used with ``combine``.
        in_place: Forwarded to the conversion helpers.
    """
    if input_dir.suffix == ".zip":
        merged = convert_zip(input_dir, combine, in_place)
    else:
        merged = process_directory(input_dir, combine, in_place)

    # Nothing to write unless merging was requested and produced a writer.
    if not combine or merged is None:
        return

    with open(output_file, "wb") as sink:
        merged.write(sink)


 if __name__ == "__main__":
    # Command-line entry point: parse the options and run the conversion.
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--input_dir", type=Path, required=True)
    parser.add_argument("-c", "--combine", action="store_true")
    parser.add_argument(
        "-o", "--output_file", type=Path, default="./combined.pdf", required=False
    )
    # store_true combined with default=True could never yield False.
    # BooleanOptionalAction keeps `--in_place` (still the default) and adds
    # `--no-in_place` so the behavior can actually be switched off.
    parser.add_argument(
        "--in_place", action=argparse.BooleanOptionalAction, default=True
    )

    args = parser.parse_args()

    convert(args.input_dir, args.combine, args.output_file, args.in_place)
	import argparse
	import tempfile
	import zipfile
	from io import BytesIO
	from pathlib import Path
	from typing import Optional

	import docx2pdf
	import pypdf
	from pypdf.generic import AnnotationBuilder


	def append_header(
	    name: str,
	    page: pypdf.PageObject,
	    writer: pypdf.PdfWriter,
	    band_height: float = 50,
	) -> pypdf.PageObject:
	    """Add ``name`` as a free-text annotation in a band along the top of ``page``.

	    Args:
	        name: Text shown in the annotation.
	        page: Page whose media box positions the annotation.
	        writer: Writer the annotation is registered with; ``page`` is handed
	            to ``writer.add_annotation`` unchanged.
	        band_height: Height of the annotation box, in the same units as the
	            page's media box, measured down from the top edge.

	    Returns:
	        The same ``page`` object that was passed in.
	    """
	    box = page.mediabox
	    # pypdf documents rect as (x_lower_left, y_lower_left, x_upper_right,
	    # y_upper_right); the band hugs the top edge of the page.
	    header_rect = (box.left, box.top - band_height, box.right, box.top)

	    annotation = AnnotationBuilder.free_text(
	        name,
	        rect=header_rect,
	        font="Arial",
	        bold=True,
	        italic=True,
	        font_size="20pt",
	        font_color="00ff00",
	        border_color="0000ff",
	    )
	    writer.add_annotation(page, annotation)

	    return page


	def convert_pdf(input_file: Path, in_place: bool) -> Path:
	    """Flatten a PDF and its embedded PDF attachments into one document.

	    The pages of ``input_file`` are copied first, followed by the pages of
	    every attachment that pypdf manages to open. When the file carries
	    attachments its first page is dropped (presumably a cover sheet that only
	    lists them -- confirm against the source documents).

	    Args:
	        input_file: PDF to read.
	        in_place: Overwrite ``input_file`` when true; otherwise write next to
	            it with ``_normalized`` appended to the stem.

	    Returns:
	        Path of the file that was written.
	    """
	    output_file = (
	        input_file
	        if in_place
	        else input_file.with_stem(f"{input_file.stem}_normalized")
	    )

	    pdf = pypdf.PdfReader(input_file)
	    writer = pypdf.PdfWriter()

	    # Look the attachments up once; the same mapping drives both the
	    # first-page check and the merge loop below.
	    attachments = pdf.attachments
	    skip_first_page = len(attachments) > 0

	    for index, page in enumerate(pdf.pages):
	        if skip_first_page and index == 0:
	            continue
	        writer.add_page(page)

	    for name, blobs in attachments.items():
	        for blob in blobs:
	            try:
	                with BytesIO(blob) as stream:
	                    for page in pypdf.PdfReader(stream).pages:
	                        writer.add_page(page)
	            except Exception as e:
	                # Best effort: an attachment that cannot be read as a PDF is
	                # reported and skipped rather than aborting the whole file.
	                print(f"Failed to process attachment {name}: {e}")

	    with open(output_file, "wb") as f:
	        writer.write(f)

	    return output_file


	def convert_docx(input_file: Path, in_place: bool) -> Path:
	    """Convert a Word document to a PDF placed alongside it.

	    Args:
	        input_file: Document to convert.
	        in_place: When true, delete ``input_file`` once its PDF exists.

	    Returns:
	        Path of the PDF, i.e. ``input_file`` with its suffix replaced by
	        ``.pdf``.
	    """
	    output_file = input_file.with_suffix(".pdf")
	    docx2pdf.convert(input_file, output_file)

	    # Only discard the source after the PDF is confirmed on disk, so a
	    # conversion that produced nothing cannot cost the caller the original.
	    if in_place and output_file.exists():
	        input_file.unlink()

	    return output_file


	def convert_file(input_file: Path, in_place: bool) -> Optional[pypdf.PdfWriter]:
	    """Convert one file and load the resulting PDF into a fresh writer.

	    Args:
	        input_file: Candidate file; only ``.pdf`` and ``.docx`` suffixes
	            (matched case-insensitively) are handled.
	        in_place: Forwarded to the format-specific converter.

	    Returns:
	        A writer holding the pages of the converted PDF, or ``None`` when the
	        suffix is not supported.
	    """
	    # Lower-case the suffix so files named e.g. "REPORT.PDF" are not skipped.
	    suffix = input_file.suffix.lower()

	    if suffix == ".pdf":
	        path = convert_pdf(input_file, in_place)
	    elif suffix == ".docx":
	        path = convert_docx(input_file, in_place)
	    else:
	        return None

	    writer = pypdf.PdfWriter()
	    for page in pypdf.PdfReader(path).pages:
	        writer.add_page(page)

	    return writer


	def process_directory(
	    input_dir: Path, combine: bool, in_place: bool
	) -> Optional[pypdf.PdfWriter]:
	    """Convert every direct child of ``input_dir``, optionally merging the results.

	    Entries are visited in sorted order so the page order of the merged
	    document is reproducible. Subdirectories are not descended into. An entry
	    that fails to convert is reported on stdout and skipped.

	    Args:
	        input_dir: Directory whose entries are converted.
	        combine: When true, collect the pages of every converted file into a
	            single writer.
	        in_place: Forwarded to ``convert_file``.

	    Returns:
	        The merged writer when ``combine`` is true, otherwise ``None``.
	    """
	    combined_writer = pypdf.PdfWriter() if combine else None

	    for input_file in sorted(input_dir.glob("*")):
	        try:
	            converted_writer = convert_file(input_file, in_place)

	            if combined_writer is not None and converted_writer is not None:
	                for page in converted_writer.pages:
	                    combined_writer.add_page(page)
	        except Exception as e:
	            # Best effort per file, but let KeyboardInterrupt/SystemExit
	            # propagate and keep the reason in the report.
	            print(f"Failed to convert {input_file}: {e}")

	    return combined_writer


	def convert_zip(
	    input_file: Path, combine: bool, in_place: bool
	) -> Optional[pypdf.PdfWriter]:
	    """Unpack a zip archive into a scratch directory and convert its contents.

	    The scratch directory is removed when the function returns, so files
	    written into it during conversion do not outlive the call; the returned
	    writer is the only result handed back.

	    Args:
	        input_file: Zip archive to extract.
	        combine: Forwarded to ``process_directory``.
	        in_place: Forwarded to ``process_directory``.

	    Returns:
	        Whatever ``process_directory`` returns for the extracted files.
	    """
	    with tempfile.TemporaryDirectory() as scratch:
	        extract_root = Path(scratch)

	        with zipfile.ZipFile(input_file, "r") as archive:
	            archive.extractall(extract_root)

	        return process_directory(extract_root, combine, in_place)


	def convert(input_dir: Path, combine: bool, output_file: Path, in_place: bool) -> None:
	    """Convert a directory or zip archive and optionally write a merged PDF.

	    Args:
	        input_dir: Directory to process, or a path ending in ``.zip`` to be
	            extracted and processed.
	        combine: When true, write all converted pages to ``output_file``.
	        output_file: Destination of the merged PDF; only used with ``combine``.
	        in_place: Forwarded to the conversion helpers.
	    """
	    if input_dir.suffix == ".zip":
	        merged = convert_zip(input_dir, combine, in_place)
	    else:
	        merged = process_directory(input_dir, combine, in_place)

	    # Nothing to write unless merging was requested and produced a writer.
	    if not combine or merged is None:
	        return

	    with open(output_file, "wb") as sink:
	        merged.write(sink)


	if __name__ == "__main__":
	    # Command-line entry point: parse the options and run the conversion.
	    parser = argparse.ArgumentParser()
	    parser.add_argument("-i", "--input_dir", type=Path, required=True)
	    parser.add_argument("-c", "--combine", action="store_true")
	    parser.add_argument(
	        "-o", "--output_file", type=Path, default="./combined.pdf", required=False
	    )
	    # store_true combined with default=True could never yield False.
	    # BooleanOptionalAction keeps `--in_place` (still the default) and adds
	    # `--no-in_place` so the behavior can actually be switched off.
	    parser.add_argument(
	        "--in_place", action=argparse.BooleanOptionalAction, default=True
	    )

	    args = parser.parse_args()

	    convert(args.input_dir, args.combine, args.output_file, args.in_place)
No results found