Last active
September 2, 2024 08:34
-
-
Save alsiesta/07dcded8b0a2e05c0306d0922ad5c9bb to your computer and use it in GitHub Desktop.
This script processes a range of PDF files and identifies those with a character count exceeding a specified threshold.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
# ------------------------------------------------------------------ | |
# Script Name: pdfcwcount_range.py | |
# Description: This script processes a range of PDF files and | |
# identifies those with a character count exceeding | |
# a specified threshold. | |
# Website: https://gist.github.com/ostechnix | |
# Version: 1.0 | |
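# Usage: py pdf_processor.py <range_param> <char_threshold>
#   <range_param>: first and last PDF file names joined by a hyphen,
#                  e.g. "output_chunk_1.pdf-output_chunk_30.pdf"
#   <char_threshold>: integer; a PDF file is reported when its character count exceeds this value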
# Use Gist: curl -s https://gist.githubusercontent.com/alsiesta/07dcded8b0a2e05c0306d0922ad5c9bb/raw/countcharacter_in_pdf_range.py | py - "output_chunk_1.pdf-output_chunk_30.pdf" 40000 | |
# ------------------------------------------------------------------ | |
import sys | |
import os | |
from PyPDF2 import PdfReader | |
def count_characters_in_pdf(pdf_path):
    """Return the number of characters of text extracted from a PDF.

    Args:
        pdf_path: Path of the PDF file, handed to ``PdfReader``.

    Returns:
        The summed length of the text extracted from every page. A page
        whose ``extract_text()`` yields a falsy value (``None`` or ``""``)
        contributes zero.
    """
    document = PdfReader(pdf_path)
    # Only the total length is needed, so add up per-page lengths rather
    # than assembling the full text first.
    return sum(len(page.extract_text() or "") for page in document.pages)
def generate_pdf_list(range_param):
    """Expand a "first-last" range parameter into a list of PDF file names.

    The parameter has the form ``"<prefix>_<start>.pdf-<prefix>_<end>.pdf"``
    (the ``.pdf`` suffixes are optional when the names contain no other
    hyphen). The prefix is taken from the first name only; the second name
    only supplies the last index.

    Args:
        range_param: Range description, e.g.
            ``"output_chunk_1.pdf-output_chunk_30.pdf"``.

    Returns:
        ``["<prefix>_<i>.pdf", ...]`` for every ``i`` from the start index
        to the end index inclusive. If the start index is written with
        leading zeros (``"chunk_08.pdf"``), generated indices are
        zero-padded to the same width.

    Raises:
        ValueError: If the parameter cannot be split into two names or an
            index is not an integer.
    """
    # Split on the ".pdf-" boundary first so that prefixes which themselves
    # contain hyphens ("my-file_1.pdf-my-file_3.pdf") are handled.
    start_pdf, boundary, end_pdf = range_param.partition('.pdf-')
    if boundary:
        start_pdf += '.pdf'  # restore the suffix consumed by the separator
    else:
        parts = range_param.split('-')
        if len(parts) != 2:
            raise ValueError(
                f"Cannot split {range_param!r} into a first and last file name; "
                "expected '<prefix>_<start>.pdf-<prefix>_<end>.pdf'"
            )
        start_pdf, end_pdf = parts

    start_digits = start_pdf.split('_')[-1].replace('.pdf', '')
    end_digits = end_pdf.split('_')[-1].replace('.pdf', '')
    start_index = int(start_digits)
    end_index = int(end_digits)
    prefix = start_pdf.rsplit('_', 1)[0]  # Extract the prefix (e.g., "output_chunk")

    # Keep zero padding when the caller wrote the start index padded;
    # zfill(0) leaves unpadded indices untouched.
    width = len(start_digits) if start_digits.startswith('0') else 0
    return [
        f"{prefix}_{str(i).zfill(width)}.pdf"
        for i in range(start_index, end_index + 1)
    ]
def process_pdfs(input_pdfs, char_threshold):
    """Collect the PDFs whose character count exceeds a threshold.

    Args:
        input_pdfs: Iterable of PDF file paths to examine.
        char_threshold: A file is reported only when its character count is
            strictly greater than this value.

    Returns:
        A list of ``(pdf_path, char_count)`` tuples for the files over the
        threshold, or, when there are none, a message string saying so.
        Paths that are not existing files are reported with a
        "File not found" line on stdout and skipped.
    """
    flagged = []
    for candidate in input_pdfs:
        if not os.path.isfile(candidate):
            print(f"File not found: {candidate}")
            continue
        total = count_characters_in_pdf(candidate)
        if total > char_threshold:
            flagged.append((candidate, total))
    # An empty list is falsy, so the message is returned only when nothing matched.
    return flagged or f"No PDF has more than {char_threshold} characters"
if __name__ == "__main__":
    # Command-line entry point: expects a range parameter and an integer
    # character threshold, then prints every PDF in the range whose
    # character count exceeds the threshold.
    if len(sys.argv) != 3:
        print("Usage: py pdf_processor.py <range_param> <char_threshold>")
        sys.exit(1)

    range_param = sys.argv[1]

    # Report bad arguments as a short message instead of a traceback.
    try:
        char_threshold = int(sys.argv[2])  # Character count threshold passed as a command-line argument
    except ValueError:
        print(f"Invalid character threshold: {sys.argv[2]!r} (expected an integer)")
        sys.exit(1)

    try:
        input_pdfs = generate_pdf_list(range_param)
    except ValueError as err:
        print(f"Invalid range parameter {range_param!r}: {err}")
        sys.exit(1)

    result = process_pdfs(input_pdfs, char_threshold)

    # process_pdfs returns a list of (path, count) tuples on a match and a
    # plain message string otherwise.
    if isinstance(result, list):
        for pdf, count in result:
            print(f"PDF: {pdf} has {count} characters.")
    else:
        print(result)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment