Last active
September 2, 2024 11:18
-
-
Save alsiesta/6ad888bece95198157f395553b462f85 to your computer and use it in GitHub Desktop.
Chunks a PDF into chunks with a specific chunk_size n pages and n overlap per chunk
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
# ------------------------------------------------------------------
# Script Name: chunk_pdf_with_overlap.py
# Description: This script processes one PDF file and splits it into
#              chunks of chunk_size=n pages, with an overlap of
#              overlap=n pages between consecutive chunks.
# Website: https://gist.github.com/alsiesta
# Version: 1.0
# Usage: py chunk_pdf_with_overlap.py <input_pdf> <output_prefix>
# Example: py chunk_pdf_with_overlap.py mydocument.pdf mydocprefix
# Use Gist: curl -s https://gist.githubusercontent.com/alsiesta/6ad888bece95198157f395553b462f85/raw/chunk_pdf_with_overlap.py | py - 2BSales.pdf gistdoc
# ------------------------------------------------------------------
import sys | |
from PyPDF2 import PdfReader, PdfWriter | |
# Default chunking parameters; the command-line entry point passes these
# module-level values to chunk_pdf_with_overlap().
chunk_size = 5  # Number of pages in each chunk
overlap = 1  # Number of pages shared between consecutive chunks
def chunk_pdf_with_overlap(input_pdf, chunk_size, overlap, output_prefix):
    """Split a PDF into fixed-size chunks that share overlapping pages.

    Each chunk is written to ``{output_prefix}_chunk_{k}.pdf`` with ``k``
    starting at 1. Consecutive chunks start ``chunk_size - overlap`` pages
    apart, so the last ``overlap`` pages of one chunk are repeated at the
    start of the next one. The final chunk may be shorter than
    ``chunk_size``.

    Args:
        input_pdf: Path (or other source accepted by ``PdfReader``) of the
            PDF to split.
        chunk_size: Maximum number of pages per chunk; must be at least 1.
        overlap: Number of pages shared between consecutive chunks; must
            satisfy ``0 <= overlap < chunk_size``.
        output_prefix: Prefix used to build the output file names.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range.
    """
    # Validate before touching the input file: a non-positive step
    # (overlap >= chunk_size) would otherwise loop forever, and a negative
    # overlap would silently skip pages between chunks.
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be at least 1, got {chunk_size}")
    if not 0 <= overlap < chunk_size:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < chunk_size, "
            f"got overlap={overlap} with chunk_size={chunk_size}"
        )

    reader = PdfReader(input_pdf)
    total_pages = len(reader.pages)
    step = chunk_size - overlap

    chunk_number = 1
    start_page = 0
    while start_page < total_pages:
        # end_page is exclusive and clamped to the end of the document.
        end_page = min(start_page + chunk_size, total_pages)

        writer = PdfWriter()
        for page_index in range(start_page, end_page):
            writer.add_page(reader.pages[page_index])

        output_pdf = f"{output_prefix}_chunk_{chunk_number}.pdf"
        with open(output_pdf, "wb") as out_file:
            writer.write(out_file)

        # Once a chunk reaches the last page, stop. Advancing further would
        # emit an extra chunk made only of pages the previous chunk already
        # contains (the overlapped tail).
        if end_page >= total_pages:
            break

        start_page += step
        chunk_number += 1
if __name__ == "__main__":
    # Command-line entry point: expects <input_pdf> and <output_prefix>.
    # Exit with a usage message instead of an IndexError traceback when
    # either argument is missing.
    if len(sys.argv) < 3:
        sys.exit("Usage: py chunk_pdf_with_overlap.py <input_pdf> <output_prefix>")
    input_pdf = sys.argv[1]
    output_prefix = sys.argv[2]
    # chunk_size and overlap are the module-level defaults defined above.
    chunk_pdf_with_overlap(input_pdf, chunk_size, overlap, output_prefix)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment