nobucshirai · February 12, 2025 03:38
diff --git a/annotated_pages_extractor.py b/annotated_pages_extractor.py
 #!/usr/bin/env python3
 """
 Extract annotated pages from PDF files.

 This script reads one or more PDF files, checks each page for annotations,
 and writes a new PDF containing only those pages that contain annotations.
 If an output filename is not provided, the script uses the input file's basename
 but adds "_extracted" before the ".pdf" extension.

 Before overwriting an existing file, the user is prompted for confirmation.

 Usage examples:
    Extract annotated pages from a single PDF and write to "output.pdf":
        ./annotated_pages_extractor.py input.pdf -o output.pdf

    Process multiple PDF files (each will be saved with its original basename plus "_extracted"):
        ./annotated_pages_extractor.py input1.pdf input2.pdf
 """

 import argparse
 import os
 import sys
 from typing import List, Optional

 from PyPDF2 import PdfReader, PdfWriter
 from PyPDF2.errors import PdfReadError


 def parse_arguments() -> argparse.Namespace:
     """
     Build the command-line interface and parse the process arguments.

     Returns:
         argparse.Namespace: Namespace with ``input_files`` (one or more
         paths) and ``output`` (a filename, or None when not given).
     """
     output_help = (
         "Specify output filename. Only allowed when a single input file "
         "is provided. If not specified, the default is the input file's "
         "basename plus '_extracted' before the extension."
     )

     cli = argparse.ArgumentParser(description="Extract annotated pages from PDF files.")
     # One or more positional input paths are required.
     cli.add_argument(
         "input_files", metavar="INPUT", type=str, nargs="+",
         help="Input PDF file(s) to process.",
     )
     cli.add_argument("-o", "--output", type=str, default=None, help=output_help)
     return cli.parse_args()


 def confirm_overwrite(filename: str) -> bool:
     """
     Ask the user whether an existing file may be overwritten.

     The prompt repeats until the answer starts with "y" or "n"
     (case-insensitive); an empty answer selects the default, "no".
     If standard input is closed or exhausted (EOF), the default "no" is
     used as well, so a non-interactive run never overwrites a file and
     never aborts with an unhandled EOFError.

     Args:
         filename (str): The name of the file to potentially overwrite.

     Returns:
         bool: True if the user confirms the overwrite, False otherwise.
     """
     prompt = f'File "{filename}" already exists. Overwrite? (y)es/(n)o [n]: '
     while True:
         try:
             response = input(prompt).strip().lower()
         except EOFError:
             # No answer can be obtained; fall back to the safe default.
             return False
         if response == "" or response.startswith("n"):
             return False
         if response.startswith("y"):
             return True
         print("Please respond with 'y' or 'n'.")


 def extract_annotated_pages(input_file: str) -> Optional[PdfWriter]:
     """
     Collect the pages of a PDF that carry at least one annotation.

     A page counts as annotated when its "/Annots" entry, after resolving
     an indirect reference, is truthy (for an array: non-empty).

     Args:
         input_file (str): Path to the input PDF file.

     Returns:
         Optional[PdfWriter]: A PdfWriter holding only the annotated pages,
         or None if the file could not be read or no page is annotated.
     """
     writer = PdfWriter()
     annotated_found = False

     try:
         reader = PdfReader(input_file)
         # Iterating the pages can also raise PdfReadError on a damaged
         # file, so the loop is covered by the same handler.
         for page in reader.pages:
             annots = page.get("/Annots")
             # Dictionary-style .get() may return an unresolved indirect
             # reference, which is always truthy; resolve it so that an
             # indirectly stored empty array is not taken for annotations.
             resolve = getattr(annots, "get_object", None)
             if callable(resolve):
                 annots = resolve()
             # Empty arrays are falsy; truthy values that are not sized
             # are still treated as annotated (best-effort, as before).
             if annots:
                 writer.add_page(page)
                 annotated_found = True
     except PdfReadError as e:
         print(f"Error reading {input_file}: {e}", file=sys.stderr)
         return None

     if not annotated_found:
         return None

     return writer


 def process_file(input_file: str, output_file: Optional[str]) -> None:
     """
     Extract the annotated pages of one PDF and write them to a new file.

     Error messages go to standard error (matching main()); progress and
     skip messages go to standard output.

     Args:
         input_file (str): Path to the input PDF file.
         output_file (Optional[str]): Output filename. If None, the name is
             the input file's basename with "_extracted" inserted before
             the extension; because only the basename is used, that
             default is relative to the current working directory.
     """
     if not os.path.isfile(input_file):
         print(f"Error: File '{input_file}' not found.", file=sys.stderr)
         return

     print(f"Processing '{input_file}'...")
     writer = extract_annotated_pages(input_file)
     if writer is None:
         print(f"No annotated pages found in '{input_file}'. Skipping.")
         return

     # Derive the default output filename if none was provided.
     if output_file is None:
         base, ext = os.path.splitext(os.path.basename(input_file))
         output_file = f"{base}_extracted{ext}"

     # Never replace an existing file without explicit confirmation.
     if os.path.exists(output_file) and not confirm_overwrite(output_file):
         print(f"Skipping writing to '{output_file}'.")
         return

     try:
         with open(output_file, "wb") as out_f:
             writer.write(out_f)
     except Exception as e:
         # Report and continue so remaining input files are still processed.
         print(f"Error writing to '{output_file}': {e}", file=sys.stderr)
     else:
         print(f"Annotated pages written to '{output_file}'.")


 def main() -> None:
     """
     Entry point: parse the command line and process each input PDF.

     Exits with status 1 when -o/--output is combined with more than one
     input file.
     """
     cli_args = parse_arguments()
     pdf_paths = cli_args.input_files
     explicit_output = cli_args.output or None

     # A single explicit output name cannot serve several inputs.
     if explicit_output is not None and len(pdf_paths) > 1:
         print(
             "Error: When using the -o/--output option, only one input file may be specified.",
             file=sys.stderr,
         )
         sys.exit(1)

     for pdf_path in pdf_paths:
         # Passing None lets process_file pick its default output name.
         process_file(pdf_path, explicit_output)


 if __name__ == "__main__":
     main()
	#!/usr/bin/env python3
	"""
	Extract annotated pages from PDF files.

	This script reads one or more PDF files, checks each page for annotations,
	and writes a new PDF containing only those pages that contain annotations.
	If an output filename is not provided, the script uses the input file's basename
	but adds "_extracted" before the ".pdf" extension.

	Before overwriting an existing file, the user is prompted for confirmation.

	Usage examples:
	    Extract annotated pages from a single PDF and write to "output.pdf":
	        ./annotated_pages_extractor.py input.pdf -o output.pdf

	    Process multiple PDF files (each will be saved with its original basename plus "_extracted"):
	        ./annotated_pages_extractor.py input1.pdf input2.pdf
	"""

	import argparse
	import os
	import sys
	from typing import List, Optional

	from PyPDF2 import PdfReader, PdfWriter
	from PyPDF2.errors import PdfReadError


	def parse_arguments() -> argparse.Namespace:
	    """
	    Build the command-line interface and parse the process arguments.

	    Returns:
	        argparse.Namespace: Namespace with ``input_files`` (one or more
	        paths) and ``output`` (a filename, or None when not given).
	    """
	    parser = argparse.ArgumentParser(
	        description="Extract annotated pages from PDF files."
	    )
	    # One or more positional input paths are required.
	    parser.add_argument(
	        "input_files",
	        metavar="INPUT",
	        type=str,
	        nargs="+",
	        help="Input PDF file(s) to process.",
	    )
	    parser.add_argument(
	        "-o",
	        "--output",
	        type=str,
	        default=None,
	        help=(
	            "Specify output filename. Only allowed when a single input file "
	            "is provided. If not specified, the default is the input file's "
	            "basename plus '_extracted' before the extension."
	        ),
	    )
	    return parser.parse_args()


	def confirm_overwrite(filename: str) -> bool:
	    """
	    Ask the user whether an existing file may be overwritten.

	    The prompt repeats until the answer starts with "y" or "n"
	    (case-insensitive); an empty answer selects the default, "no".
	    If standard input is closed or exhausted (EOF), the default "no" is
	    used as well, so a non-interactive run never overwrites a file and
	    never aborts with an unhandled EOFError.

	    Args:
	        filename (str): The name of the file to potentially overwrite.

	    Returns:
	        bool: True if the user confirms the overwrite, False otherwise.
	    """
	    prompt = f'File "{filename}" already exists. Overwrite? (y)es/(n)o [n]: '
	    while True:
	        try:
	            response = input(prompt).strip().lower()
	        except EOFError:
	            # No answer can be obtained; fall back to the safe default.
	            return False
	        if response == "" or response.startswith("n"):
	            return False
	        elif response.startswith("y"):
	            return True
	        else:
	            print("Please respond with 'y' or 'n'.")


	def extract_annotated_pages(input_file: str) -> Optional[PdfWriter]:
	    """
	    Collect the pages of a PDF that carry at least one annotation.

	    A page counts as annotated when its "/Annots" entry, after resolving
	    an indirect reference, is truthy (for an array: non-empty).

	    Args:
	        input_file (str): Path to the input PDF file.

	    Returns:
	        Optional[PdfWriter]: A PdfWriter holding only the annotated pages,
	        or None if the file could not be read or no page is annotated.
	    """
	    writer = PdfWriter()
	    annotated_found = False

	    try:
	        reader = PdfReader(input_file)
	        # Iterating the pages can also raise PdfReadError on a damaged
	        # file, so the loop is covered by the same handler.
	        for page in reader.pages:
	            annots = page.get("/Annots")
	            # Dictionary-style .get() may return an unresolved indirect
	            # reference, which is always truthy; resolve it so that an
	            # indirectly stored empty array is not taken for annotations.
	            resolve = getattr(annots, "get_object", None)
	            if callable(resolve):
	                annots = resolve()
	            # Empty arrays are falsy; truthy values that are not sized
	            # are still treated as annotated (best-effort, as before).
	            if annots:
	                writer.add_page(page)
	                annotated_found = True
	    except PdfReadError as e:
	        print(f"Error reading {input_file}: {e}", file=sys.stderr)
	        return None

	    if not annotated_found:
	        return None

	    return writer


	def process_file(input_file: str, output_file: Optional[str]) -> None:
	    """
	    Extract the annotated pages of one PDF and write them to a new file.

	    Error messages go to standard error (matching main()); progress and
	    skip messages go to standard output.

	    Args:
	        input_file (str): Path to the input PDF file.
	        output_file (Optional[str]): Output filename. If None, the name is
	            the input file's basename with "_extracted" inserted before
	            the extension; because only the basename is used, that
	            default is relative to the current working directory.
	    """
	    if not os.path.isfile(input_file):
	        print(f"Error: File '{input_file}' not found.", file=sys.stderr)
	        return

	    print(f"Processing '{input_file}'...")
	    writer = extract_annotated_pages(input_file)
	    if writer is None:
	        print(f"No annotated pages found in '{input_file}'. Skipping.")
	        return

	    # Derive the default output filename if none was provided.
	    if output_file is None:
	        base, ext = os.path.splitext(os.path.basename(input_file))
	        output_file = f"{base}_extracted{ext}"

	    # Never replace an existing file without explicit confirmation.
	    if os.path.exists(output_file):
	        if not confirm_overwrite(output_file):
	            print(f"Skipping writing to '{output_file}'.")
	            return

	    try:
	        with open(output_file, "wb") as out_f:
	            writer.write(out_f)
	    except Exception as e:
	        # Report and continue so remaining input files are still processed.
	        print(f"Error writing to '{output_file}': {e}", file=sys.stderr)
	    else:
	        print(f"Annotated pages written to '{output_file}'.")


	def main() -> None:
	    """
	    Entry point: parse the command line and process each input PDF.

	    Exits with status 1 when -o/--output is combined with more than one
	    input file.
	    """
	    args = parse_arguments()

	    # If an output filename is specified, ensure only one input file is given.
	    if args.output and len(args.input_files) > 1:
	        print("Error: When using the -o/--output option, only one input file may be specified.", file=sys.stderr)
	        sys.exit(1)

	    # An empty or missing -o value means "use the default name".
	    output_file = args.output or None
	    for input_file in args.input_files:
	        process_file(input_file, output_file)


	if __name__ == "__main__":
	    main()