Skip to content

Instantly share code, notes, and snippets.

@nobucshirai
Created February 12, 2025 03:38
Show Gist options
  • Save nobucshirai/aeec9b42b4e51e8cb274d9e5bfaa26cf to your computer and use it in GitHub Desktop.
Save nobucshirai/aeec9b42b4e51e8cb274d9e5bfaa26cf to your computer and use it in GitHub Desktop.
PDF Annotation Extractor – This script processes PDF files, detecting and extracting only the pages that contain annotations. Useful for reviewing highlighted or commented content.
#!/usr/bin/env python3
"""
Extract annotated pages from PDF files.
This script reads one or more PDF files, checks each page for annotations,
and writes a new PDF containing only those pages that contain annotations.
If an output filename is not provided, the script uses the input file's basename
but adds "_extracted" before the ".pdf" extension.
Before overwriting an existing file, the user is prompted for confirmation.
Usage examples:
Extract annotated pages from a single PDF and write to "output.pdf":
./extract_annotated_pages.py input.pdf -o output.pdf
Process multiple PDF files (each will be saved with its original basename plus "_extracted"):
./extract_annotated_pages.py input1.pdf input2.pdf
"""
import argparse
import os
import sys
from typing import List, Optional
from PyPDF2 import PdfReader, PdfWriter
from PyPDF2.errors import PdfReadError
def parse_arguments() -> argparse.Namespace:
    """
    Build the command-line parser and parse the process arguments.

    The interface consists of one or more positional input PDF paths and
    an optional ``-o``/``--output`` filename.

    Returns:
        argparse.Namespace: Namespace holding ``input_files`` (a list of
        strings) and ``output`` (a string, or None when the option is
        not given).
    """
    output_help = (
        "Specify output filename. Only allowed when a single input file "
        "is provided. If not specified, the default is the input file's "
        "basename plus '_extracted' before the extension."
    )

    cli = argparse.ArgumentParser(description="Extract annotated pages from PDF files.")
    cli.add_argument(
        "input_files", metavar="INPUT", type=str, nargs="+", help="Input PDF file(s) to process."
    )
    cli.add_argument("-o", "--output", type=str, default=None, help=output_help)

    # parse_args() with no argument reads sys.argv[1:].
    return cli.parse_args()
def confirm_overwrite(filename: str) -> bool:
    """
    Ask the user whether an existing file may be overwritten.

    The prompt is repeated until the answer starts with "y" or "n"
    (case-insensitive, surrounding whitespace ignored). An empty answer
    selects the default, "no". If standard input is closed or exhausted
    (EOF), the default "no" is used as well, so a non-interactive run
    never overwrites a file and does not crash with an unhandled EOFError.

    Args:
        filename (str): The name of the file to potentially overwrite.

    Returns:
        bool: True if the user confirms the overwrite, False otherwise.
    """
    prompt = f'File "{filename}" already exists. Overwrite? (y)es/(n)o [n]: '
    while True:
        try:
            response = input(prompt).strip().lower()
        except EOFError:
            # No answer can be obtained; finish the prompt line and decline.
            print()
            return False

        if not response or response.startswith("n"):
            return False
        if response.startswith("y"):
            return True
        print("Please respond with 'y' or 'n'.")
def _page_has_annotations(page) -> bool:
    """Return True if the page's /Annots entry is a non-empty array."""
    annots = page.get("/Annots")
    if annots is None:
        return False

    # .get() is the plain dict lookup and may hand back an unresolved
    # indirect reference; dereference it so the emptiness check below looks
    # at the actual array rather than at the reference object.
    resolve = getattr(annots, "get_object", None)
    if callable(resolve):
        annots = resolve()

    try:
        return len(annots) > 0
    except TypeError:
        # The resolved value is not a sized object (e.g. a null object or a
        # dangling reference), so there is no annotation array to report.
        return False


def extract_annotated_pages(input_file: str) -> Optional[PdfWriter]:
    """
    Extract pages with annotations from the given PDF file.

    Args:
        input_file (str): Path to the input PDF file.

    Returns:
        Optional[PdfWriter]: A PdfWriter object containing only the pages
        with annotations, or None if no such pages exist or the file
        could not be read (in which case an error message is printed).
    """
    writer = PdfWriter()
    annotated_count = 0

    # The whole read is guarded, not just the constructor: accessing the
    # pages and dereferencing their objects can also raise PdfReadError
    # (for example on an encrypted or structurally damaged file).
    try:
        reader = PdfReader(input_file)
        for page in reader.pages:
            if _page_has_annotations(page):
                writer.add_page(page)
                annotated_count += 1
    except PdfReadError as e:
        print(f"Error reading {input_file}: {e}")
        return None

    if annotated_count == 0:
        return None
    return writer
def process_file(input_file: str, output_file: Optional[str]) -> None:
    """
    Extract the annotated pages of one PDF and save them to a new file.

    Progress and error messages are printed. Nothing is written when the
    input path is not a regular file, when extract_annotated_pages()
    returns None, or when the user declines to overwrite an existing
    output file.

    Args:
        input_file (str): Path to the input PDF file.
        output_file (Optional[str]): Output filename. If None, the name is
            the input file's basename with '_extracted' inserted before
            the extension.
    """
    if not os.path.isfile(input_file):
        print(f"Error: File '{input_file}' not found.")
        return

    print(f"Processing '{input_file}'...")
    extracted = extract_annotated_pages(input_file)
    if extracted is None:
        print(f"No annotated pages found in '{input_file}'. Skipping.")
        return

    # Fall back to "<basename>_extracted<ext>" when no name was supplied.
    if output_file is None:
        stem, suffix = os.path.splitext(os.path.basename(input_file))
        target = f"{stem}_extracted{suffix}"
    else:
        target = output_file

    # Never clobber an existing path without the user's consent.
    if os.path.exists(target) and not confirm_overwrite(target):
        print(f"Skipping writing to '{target}'.")
        return

    try:
        with open(target, "wb") as sink:
            extracted.write(sink)
        print(f"Annotated pages written to '{target}'.")
    except Exception as err:
        print(f"Error writing to '{target}': {err}")
def main() -> None:
    """
    Entry point: parse the command line and process each input PDF.

    Exits with status 1 when an output filename is combined with more
    than one input file.
    """
    args = parse_arguments()
    inputs = args.input_files
    requested_output = args.output

    # A single explicit output name cannot serve several input files.
    if requested_output and len(inputs) > 1:
        print("Error: When using the -o/--output option, only one input file may be specified.", file=sys.stderr)
        sys.exit(1)

    for path in inputs:
        # An empty or missing output name means "derive it from the input".
        process_file(path, requested_output or None)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment