Skip to content

Instantly share code, notes, and snippets.

@alsiesta
Last active September 2, 2024 11:18
Show Gist options
  • Save alsiesta/6ad888bece95198157f395553b462f85 to your computer and use it in GitHub Desktop.
Save alsiesta/6ad888bece95198157f395553b462f85 to your computer and use it in GitHub Desktop.
Chunks a PDF into chunks with a specific chunk_size n pages and n overlap per chunk
#!/usr/bin/env python3
# ------------------------------------------------------------------
# Script Name: chunk_pdf_with_overlap.py
# Description: This script splits one PDF file into chunks of
# chunk_size pages each, where consecutive chunks share
# overlap pages.
# Website: https://gist.github.com/alsiesta
# Version: 1.0
# Usage: py chunk_pdf_with_overlap.py <input_pdf> <output_prefix>
# Example: py chunk_pdf_with_overlap.py mydocument.pdf mydocprefix
# Use Gist: curl -s https://gist.githubusercontent.com/alsiesta/6ad888bece95198157f395553b462f85/raw/chunk_pdf_with_overlap.py | py - 2BSales.pdf gistdoc
# ------------------------------------------------------------------
import sys
from PyPDF2 import PdfReader, PdfWriter
# Default chunking parameters; the script entry point passes both values to
# chunk_pdf_with_overlap().
chunk_size = 5 # Maximum number of pages written to each chunk
overlap = 1 # Number of pages shared by consecutive chunks
def chunk_pdf_with_overlap(input_pdf, chunk_size, overlap, output_prefix):
    """Split a PDF into overlapping page-range chunks, one file per chunk.

    Chunk number k (counting from 1) is written to
    ``{output_prefix}_chunk_{k}.pdf``. Every chunk holds up to ``chunk_size``
    consecutive pages, and each chunk after the first starts
    ``chunk_size - overlap`` pages after the start of the previous one, so
    consecutive chunks share ``overlap`` pages. The final chunk may be
    shorter than ``chunk_size``.

    Args:
        input_pdf: PDF to split; passed unchanged to ``PdfReader``.
        chunk_size: Maximum number of pages per chunk; must be at least 1.
        overlap: Number of pages shared by consecutive chunks; must be
            smaller than ``chunk_size``. A negative value leaves a gap of
            that many pages between chunks instead of an overlap.
        output_prefix: Prefix (may include a directory) of the output files.

    Raises:
        ValueError: If ``chunk_size`` is below 1 or ``overlap`` is not
            smaller than ``chunk_size``; either would keep the start page
            from ever advancing.
    """
    # Validate before touching the input so bad parameters fail fast instead
    # of looping forever while writing files.
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be at least 1, got {chunk_size}")
    if overlap >= chunk_size:
        raise ValueError(
            f"overlap ({overlap}) must be smaller than "
            f"chunk_size ({chunk_size})"
        )

    reader = PdfReader(input_pdf)
    total_pages = len(reader.pages)
    last_page = total_pages - 1
    step = chunk_size - overlap  # distance between consecutive chunk starts

    chunk_starts = range(0, total_pages, step)
    for chunk_number, start_page in enumerate(chunk_starts, start=1):
        end_page = min(start_page + chunk_size - 1, last_page)  # inclusive

        writer = PdfWriter()
        for page_index in range(start_page, end_page + 1):
            writer.add_page(reader.pages[page_index])

        output_pdf = f"{output_prefix}_chunk_{chunk_number}.pdf"
        with open(output_pdf, "wb") as out_file:
            writer.write(out_file)

        # Once a chunk reaches the last page, any further chunk would only
        # repeat overlap pages that were already written, so stop here.
        if end_page == last_page:
            break
if __name__ == "__main__":
    # Both positional arguments are required; without them, exit with a usage
    # message (non-zero status) rather than an IndexError traceback. Any
    # additional arguments are ignored.
    if len(sys.argv) < 3:
        sys.exit("Usage: py chunk_pdf_with_overlap.py <input_pdf> <output_prefix>")
    input_pdf = sys.argv[1]
    output_prefix = sys.argv[2]
    # chunk_size and overlap are the module-level defaults defined above.
    chunk_pdf_with_overlap(input_pdf, chunk_size, overlap, output_prefix)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment