Created
February 12, 2025 03:38
-
-
Save nobucshirai/aeec9b42b4e51e8cb274d9e5bfaa26cf to your computer and use it in GitHub Desktop.
PDF Annotation Extractor – This script processes PDF files, detecting and extracting only the pages that contain annotations. Useful for reviewing highlighted or commented content.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
""" | |
Extract annotated pages from PDF files. | |
This script reads one or more PDF files, checks each page for annotations, | |
and writes a new PDF containing only those pages that contain annotations. | |
If an output filename is not provided, the script uses the input file's basename | |
but adds "_extracted" before the ".pdf" extension. | |
Before overwriting an existing file, the user is prompted for confirmation. | |
Usage examples: | |
Extract annotated pages from a single PDF and write to "output.pdf": | |
./extract_annotated_pages.py input.pdf -o output.pdf | |
Process multiple PDF files (each will be saved with its original basename plus "_extracted"): | |
./extract_annotated_pages.py input1.pdf input2.pdf | |
""" | |
import argparse | |
import os | |
import sys | |
from typing import List, Optional | |
from PyPDF2 import PdfReader, PdfWriter | |
from PyPDF2.errors import PdfReadError | |
def parse_arguments() -> argparse.Namespace:
    """
    Build the command-line interface and parse the process arguments.

    The interface accepts one or more positional input paths and an
    optional ``-o/--output`` filename.

    Returns:
        argparse.Namespace: Namespace with ``input_files`` (list of str)
            and ``output`` (str, or None when the option is omitted).
    """
    output_help = (
        "Specify output filename. Only allowed when a single input file "
        "is provided. If not specified, the default is the input file's "
        "basename plus '_extracted' before the extension."
    )

    cli = argparse.ArgumentParser(
        description="Extract annotated pages from PDF files."
    )
    # Positional: at least one input path is required (nargs="+").
    cli.add_argument(
        "input_files",
        metavar="INPUT",
        type=str,
        nargs="+",
        help="Input PDF file(s) to process.",
    )
    cli.add_argument("-o", "--output", type=str, default=None, help=output_help)

    return cli.parse_args()
def confirm_overwrite(filename: str) -> bool:
    """
    Ask the user for confirmation to overwrite an existing file.

    The prompt repeats until the answer starts with "y" or "n" (case
    insensitive). An empty answer selects the default, "no". If standard
    input is closed or exhausted (for example when the script runs
    non-interactively), the default "no" is used instead of crashing, so
    an existing file is never overwritten without explicit consent.

    Args:
        filename (str): The name of the file to potentially overwrite.

    Returns:
        bool: True if user confirms overwrite, False otherwise.
    """
    prompt = f'File "{filename}" already exists. Overwrite? (y)es/(n)o [n]: '
    while True:
        try:
            response = input(prompt).strip().lower()
        except EOFError:
            # No answer can be read; terminate the prompt line and fall
            # back to the safe default rather than raising a traceback.
            print()
            return False
        if response == "" or response.startswith("n"):
            return False
        if response.startswith("y"):
            return True
        print("Please respond with 'y' or 'n'.")
def _page_has_annotations(page) -> bool:
    """Return True when *page* has a non-empty ``/Annots`` entry."""
    annots = page.get("/Annots")
    if annots is None:
        return False
    # NOTE(review): .get() may hand back an unresolved indirect reference
    # rather than the annotation array itself (presumably only item access
    # resolves references -- confirm against the PyPDF2 version in use).
    # Resolve it when possible so an indirectly stored empty array is not
    # mistaken for real annotations.
    resolve = getattr(annots, "get_object", None)
    if callable(resolve):
        annots = resolve()
    if annots is None:
        return False
    try:
        return len(annots) > 0
    except TypeError:
        # Not a sized object: keep the page rather than risk dropping
        # annotated content.
        return True


def extract_annotated_pages(input_file: str) -> Optional[PdfWriter]:
    """
    Extract pages with annotations from the given PDF file.

    Args:
        input_file (str): Path to the input PDF file.

    Returns:
        Optional[PdfWriter]: A PdfWriter object containing only the pages
            with annotations, or None if no such pages exist or the file
            could not be read (an error message is printed to stderr in
            the latter case).
    """
    try:
        reader = PdfReader(input_file)
        # Pages are inspected inside the same try block so that a read
        # error surfacing while iterating is reported like one raised
        # when opening the file, instead of aborting the whole run.
        annotated_pages = [
            page for page in reader.pages if _page_has_annotations(page)
        ]
    except PdfReadError as e:
        print(f"Error reading {input_file}: {e}", file=sys.stderr)
        return None

    if not annotated_pages:
        return None

    writer = PdfWriter()
    for page in annotated_pages:
        writer.add_page(page)
    return writer
def process_file(input_file: str, output_file: Optional[str]) -> None:
    """
    Process a single PDF file by extracting annotated pages and writing them to an output file.

    Progress messages are printed to stdout; error messages are printed to
    stderr. Nothing is written when the input is missing, when no
    annotated pages are found, or when the user declines to overwrite an
    existing output file.

    Args:
        input_file (str): Path to the input PDF file.
        output_file (Optional[str]): Output filename. If None, uses the input file's basename plus '_extracted'.
    """
    if not os.path.isfile(input_file):
        print(f"Error: File '{input_file}' not found.", file=sys.stderr)
        return

    print(f"Processing '{input_file}'...")
    writer = extract_annotated_pages(input_file)
    if writer is None:
        print(f"No annotated pages found in '{input_file}'. Skipping.")
        return

    # Derive the default output filename if none was provided. Only the
    # basename is used, so the result is a relative name without the
    # input's directory part.
    if output_file is None:
        base, ext = os.path.splitext(os.path.basename(input_file))
        # Insert "_extracted" before the extension.
        output_file = f"{base}_extracted{ext}"

    # Never overwrite an existing file without explicit confirmation.
    if os.path.exists(output_file) and not confirm_overwrite(output_file):
        print(f"Skipping writing to '{output_file}'.")
        return

    try:
        with open(output_file, "wb") as out_f:
            writer.write(out_f)
    except Exception as e:
        # Best effort: report the failure and return so the caller can
        # carry on with any remaining files.
        print(f"Error writing to '{output_file}': {e}", file=sys.stderr)
    else:
        print(f"Annotated pages written to '{output_file}'.")
def main() -> None:
    """
    Entry point: parse the command line and process every input PDF.

    Exits with status 1 when -o/--output is combined with more than one
    input file.
    """
    options = parse_arguments()
    inputs = options.input_files

    # A single explicit output name cannot serve several inputs.
    if len(inputs) > 1 and options.output:
        print("Error: When using the -o/--output option, only one input file may be specified.", file=sys.stderr)
        sys.exit(1)

    # An empty -o value is treated the same as no -o at all.
    target = options.output or None
    for path in inputs:
        process_file(path, target)


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment