Skip to content

Instantly share code, notes, and snippets.

@alsiesta
Last active September 2, 2024 08:34
Show Gist options
  • Save alsiesta/07dcded8b0a2e05c0306d0922ad5c9bb to your computer and use it in GitHub Desktop.
Save alsiesta/07dcded8b0a2e05c0306d0922ad5c9bb to your computer and use it in GitHub Desktop.
This script processes a range of PDF files and identifies those with a character count exceeding a specified threshold.
#!/usr/bin/env python3
# ------------------------------------------------------------------
# Script Name: pdfcwcount_range.py
# Description: This script processes a range of PDF files and
# identifies those with a character count exceeding
# a specified threshold.
# Website: https://gist.github.com/ostechnix
# Version: 1.0
# Usage: py pdf_processor.py <range_param> <char_threshold>
# Example: py pdf_processor.py "output_chunk_1.pdf-output_chunk_30.pdf" 40000
# Use Gist: curl -s https://gist.githubusercontent.com/alsiesta/07dcded8b0a2e05c0306d0922ad5c9bb/raw/countcharacter_in_pdf_range.py | py - "output_chunk_1.pdf-output_chunk_30.pdf" 40000
# ------------------------------------------------------------------
import sys
import os
from PyPDF2 import PdfReader
def count_characters_in_pdf(pdf_path):
    """Return the number of characters of text extracted from a PDF.

    The file at ``pdf_path`` is opened with ``PdfReader`` and the text of
    every page is extracted. A page whose ``extract_text()`` result is falsy
    (for example ``None``) contributes zero characters.
    """
    pages = PdfReader(pdf_path).pages
    # Adding up the per-page lengths equals the length of the joined text.
    return sum(len(page.extract_text() or "") for page in pages)
def _parse_numbered_name(name):
    """Split ``name`` into (prefix, separator, index text, index value).

    A trailing ``.pdf`` is ignored. The index is whatever follows the last
    underscore; when there is no underscore the whole stem is the index and
    both prefix and separator are empty strings.

    Raises:
        ValueError: if the index part is not an integer.
    """
    stem = name[:-len('.pdf')] if name.endswith('.pdf') else name
    prefix, sep, digits = stem.rpartition('_')
    try:
        number = int(digits)
    except ValueError:
        raise ValueError(f"no numeric index found in {name!r}") from None
    return prefix, sep, digits, number


def generate_pdf_list(range_param):
    """Expand a ``<first>-<last>`` range into the list of PDF file names.

    Example: ``"output_chunk_1.pdf-output_chunk_3.pdf"`` yields
    ``["output_chunk_1.pdf", "output_chunk_2.pdf", "output_chunk_3.pdf"]``.

    The prefix (and any zero padding of the index) is taken from the first
    name; only the numeric index of the last name is used. An empty list is
    returned when the first index is greater than the last.

    Args:
        range_param: two file names joined by a hyphen. The ``.pdf``
            extension is optional; when it is present the names themselves
            may contain hyphens.

    Returns:
        The file names for every index from first to last, inclusive.

    Raises:
        ValueError: if the range cannot be split into two names or an index
            is not an integer.
    """
    # Prefer the ".pdf-" boundary so hyphens inside file names do not break
    # the split; fall back to a plain two-way split for extension-less names.
    start_name, marker, end_name = range_param.partition('.pdf-')
    if marker and end_name:
        start_name += '.pdf'
    else:
        pieces = range_param.split('-')
        if len(pieces) != 2:
            raise ValueError(
                f"expected '<first>.pdf-<last>.pdf', got {range_param!r}"
            )
        start_name, end_name = pieces

    prefix, sep, start_digits, start_index = _parse_numbered_name(start_name)
    end_index = _parse_numbered_name(end_name)[3]

    # Keep zero padding (chunk_01.pdf -> chunk_02.pdf) so the generated names
    # match the files that actually exist on disk.
    width = len(start_digits) if start_digits.startswith('0') else 0
    return [
        f"{prefix}{sep}{str(i).zfill(width)}.pdf"
        for i in range(start_index, end_index + 1)
    ]
def process_pdfs(input_pdfs, char_threshold):
    """Find the PDFs whose extracted text is longer than ``char_threshold``.

    Paths that are not existing files are reported on stdout and skipped.

    Args:
        input_pdfs: iterable of PDF file paths to inspect.
        char_threshold: a PDF is reported only when its character count is
            strictly greater than this value.

    Returns:
        A list of ``(pdf_path, char_count)`` tuples when at least one PDF is
        over the threshold; otherwise a message string saying none was.
    """
    flagged = []
    for candidate in input_pdfs:
        if not os.path.isfile(candidate):
            print(f"File not found: {candidate}")
            continue
        total = count_characters_in_pdf(candidate)
        if total > char_threshold:
            flagged.append((candidate, total))
    # An empty list is falsy, so the message is returned when nothing matched.
    return flagged or f"No PDF has more than {char_threshold} characters"
if __name__ == "__main__":
    # Exactly two positional arguments are required: the range and the threshold.
    if len(sys.argv) != 3:
        print("Usage: py pdf_processor.py <range_param> <char_threshold>")
        sys.exit(1)

    range_param = sys.argv[1]

    # Report bad arguments with a readable message instead of a traceback;
    # sys.exit(str) prints to stderr and exits with status 1.
    try:
        char_threshold = int(sys.argv[2])  # Character count threshold
    except ValueError:
        sys.exit(f"Error: <char_threshold> must be an integer, got {sys.argv[2]!r}")

    try:
        input_pdfs = generate_pdf_list(range_param)
    except ValueError as err:
        sys.exit(f"Error: invalid <range_param> {range_param!r}: {err}")

    result = process_pdfs(input_pdfs, char_threshold)

    # process_pdfs returns a list of (path, count) pairs, or a message string
    # when no PDF is over the threshold.
    if isinstance(result, list):
        for pdf, count in result:
            print(f"PDF: {pdf} has {count} characters.")
    else:
        print(result)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment