alsiesta · September 2, 2024 08:34
diff --git a/countcharacter_in_pdf_range.py b/countcharacter_in_pdf_range.py
 #!/usr/bin/env python3

 # ------------------------------------------------------------------
 # Script Name:   countcharacter_in_pdf_range.py
 # Description:   This script processes a range of PDF files and
 #                identifies those with a character count exceeding
 #                a specified threshold.
 # Website:       https://gist.github.com/ostechnix
 # Version:       1.0
 # Usage:         py countcharacter_in_pdf_range.py <range_param> <char_threshold>
 #                <range_param> has the form "<prefix>_<start>.pdf-<prefix>_<end>.pdf".
 # Use Gist:      curl -s https://gist.githubusercontent.com/alsiesta/07dcded8b0a2e05c0306d0922ad5c9bb/raw/countcharacter_in_pdf_range.py | py - "output_chunk_1.pdf-output_chunk_30.pdf" 40000
 # ------------------------------------------------------------------

 import sys
 import os
 from PyPDF2 import PdfReader

 def count_characters_in_pdf(pdf_path):
     """Return the number of characters of extractable text in a PDF.

     Args:
         pdf_path: Path of the PDF file; handed straight to ``PdfReader``.

     Returns:
         The summed length of the text extracted from every page. A page
         whose extraction yields a falsy value (``None`` or ``""``)
         contributes zero.
     """
     pages = PdfReader(pdf_path).pages
     # ``or ""`` keeps len() safe when a page yields no text at all.
     return sum(len(page.extract_text() or "") for page in pages)

 def generate_pdf_list(range_param):
     """Expand a "first-last" file range into the list of PDF file names.

     Args:
         range_param: Range such as
             ``"output_chunk_1.pdf-output_chunk_30.pdf"``. Each side must
             end in ``_<number>`` (optionally followed by ``.pdf``).

     Returns:
         ``["<prefix>_<i>.pdf", ...]`` for every ``i`` from the start index
         to the end index inclusive. The prefix is taken from the start
         file only; the end file contributes just its number.

     Raises:
         ValueError: If the range cannot be split into two file names or
             an index is not an integer.
     """
     # Split on the ".pdf-" boundary first so that hyphens inside the file
     # prefix (e.g. "my-file_1.pdf-my-file_3.pdf") do not break the parse.
     start_pdf, boundary, end_pdf = range_param.partition('.pdf-')
     if boundary:
         start_pdf += '.pdf'
     else:
         # No ".pdf-" boundary: accept the plain "<start>-<end>" form.
         parts = range_param.split('-')
         if len(parts) != 2:
             raise ValueError(
                 "Expected '<prefix>_<start>.pdf-<prefix>_<end>.pdf', "
                 f"got {range_param!r}"
             )
         start_pdf, end_pdf = parts

     # The index is whatever follows the last underscore, minus ".pdf".
     start_index = int(start_pdf.split('_')[-1].replace('.pdf', ''))
     end_index = int(end_pdf.split('_')[-1].replace('.pdf', ''))

     prefix = start_pdf.rsplit('_', 1)[0]  # e.g. "output_chunk"

     return [f"{prefix}_{i}.pdf" for i in range(start_index, end_index + 1)]

 def process_pdfs(input_pdfs, char_threshold):
     """Collect the PDFs whose character count exceeds a threshold.

     Args:
         input_pdfs: Iterable of PDF file paths to inspect.
         char_threshold: A file is reported only when its character count
             is strictly greater than this value.

     Returns:
         A list of ``(path, character_count)`` tuples for the files over
         the threshold, or a message string when no file qualifies.
         Paths that are not existing files are reported on stdout and
         skipped.
     """
     flagged = []
     for candidate in input_pdfs:
         if not os.path.isfile(candidate):
             print(f"File not found: {candidate}")
             continue

         total = count_characters_in_pdf(candidate)
         if total > char_threshold:
             flagged.append((candidate, total))

     if not flagged:
         return f"No PDF has more than {char_threshold} characters"
     return flagged

 if __name__ == "__main__":
     # Exactly two arguments are required: the file range and the threshold.
     if len(sys.argv) != 3:
         print("Usage: py pdf_processor.py <range_param> <char_threshold>")
         sys.exit(1)

     range_param = sys.argv[1]

     # A non-numeric threshold would otherwise abort with a raw traceback.
     try:
         char_threshold = int(sys.argv[2])
     except ValueError:
         print(
             f"Invalid character threshold: {sys.argv[2]!r} (expected an integer)",
             file=sys.stderr,
         )
         sys.exit(1)

     # generate_pdf_list raises ValueError when the range cannot be parsed.
     try:
         input_pdfs = generate_pdf_list(range_param)
     except ValueError as exc:
         print(f"Invalid range {range_param!r}: {exc}", file=sys.stderr)
         sys.exit(1)

     result = process_pdfs(input_pdfs, char_threshold)

     # process_pdfs returns a list of hits, or a message string when empty.
     if isinstance(result, list):
         for pdf, count in result:
             print(f"PDF: {pdf} has {count} characters.")
     else:
         print(result)
	#!/usr/bin/env python3

	# ------------------------------------------------------------------
	# Script Name: countcharacter_in_pdf_range.py
	# Description: This script processes a range of PDF files and
	# identifies those with a character count exceeding
	# a specified threshold.
	# Website: https://gist.github.com/ostechnix
	# Version: 1.0
	# Usage: py countcharacter_in_pdf_range.py <range_param> <char_threshold>
	# <range_param> has the form "<prefix>_<start>.pdf-<prefix>_<end>.pdf".
	# Use Gist: curl -s https://gist.githubusercontent.com/alsiesta/07dcded8b0a2e05c0306d0922ad5c9bb/raw/countcharacter_in_pdf_range.py | py - "output_chunk_1.pdf-output_chunk_30.pdf" 40000
	# ------------------------------------------------------------------

	import sys
	import os
	from PyPDF2 import PdfReader

	def count_characters_in_pdf(pdf_path):
	    """Return the number of characters of extractable text in a PDF.

	    Args:
	        pdf_path: Path of the PDF file; handed straight to ``PdfReader``.

	    Returns:
	        The summed length of the text extracted from every page. A page
	        whose extraction yields a falsy value (``None`` or ``""``)
	        contributes zero.
	    """
	    reader = PdfReader(pdf_path)
	    # ``or ""`` keeps len() safe when a page yields no text at all.
	    return sum(len(page.extract_text() or "") for page in reader.pages)

	def generate_pdf_list(range_param):
	    """Expand a "first-last" file range into the list of PDF file names.

	    Args:
	        range_param: Range such as
	            ``"output_chunk_1.pdf-output_chunk_30.pdf"``. Each side must
	            end in ``_<number>`` (optionally followed by ``.pdf``).

	    Returns:
	        ``["<prefix>_<i>.pdf", ...]`` for every ``i`` from the start index
	        to the end index inclusive. The prefix is taken from the start
	        file only; the end file contributes just its number.

	    Raises:
	        ValueError: If the range cannot be split into two file names or
	            an index is not an integer.
	    """
	    # Split on the ".pdf-" boundary first so that hyphens inside the file
	    # prefix (e.g. "my-file_1.pdf-my-file_3.pdf") do not break the parse.
	    start_pdf, boundary, end_pdf = range_param.partition('.pdf-')
	    if boundary:
	        start_pdf += '.pdf'
	    else:
	        # No ".pdf-" boundary: accept the plain "<start>-<end>" form.
	        parts = range_param.split('-')
	        if len(parts) != 2:
	            raise ValueError(
	                "Expected '<prefix>_<start>.pdf-<prefix>_<end>.pdf', "
	                f"got {range_param!r}"
	            )
	        start_pdf, end_pdf = parts

	    # The index is whatever follows the last underscore, minus ".pdf".
	    start_index = int(start_pdf.split('_')[-1].replace('.pdf', ''))
	    end_index = int(end_pdf.split('_')[-1].replace('.pdf', ''))

	    prefix = start_pdf.rsplit('_', 1)[0]  # e.g. "output_chunk"

	    return [f"{prefix}_{i}.pdf" for i in range(start_index, end_index + 1)]

	def process_pdfs(input_pdfs, char_threshold):
	    """Collect the PDFs whose character count exceeds a threshold.

	    Args:
	        input_pdfs: Iterable of PDF file paths to inspect.
	        char_threshold: A file is reported only when its character count
	            is strictly greater than this value.

	    Returns:
	        A list of ``(path, character_count)`` tuples for the files over
	        the threshold, or a message string when no file qualifies.
	        Paths that are not existing files are reported on stdout and
	        skipped.
	    """
	    pdfs_over_threshold = []
	    for pdf_path in input_pdfs:
	        if not os.path.isfile(pdf_path):
	            print(f"File not found: {pdf_path}")
	            continue

	        char_count = count_characters_in_pdf(pdf_path)
	        if char_count > char_threshold:
	            pdfs_over_threshold.append((pdf_path, char_count))

	    if pdfs_over_threshold:
	        return pdfs_over_threshold
	    return f"No PDF has more than {char_threshold} characters"

	if __name__ == "__main__":
	    # Exactly two arguments are required: the file range and the threshold.
	    if len(sys.argv) != 3:
	        print("Usage: py pdf_processor.py <range_param> <char_threshold>")
	        sys.exit(1)

	    range_param = sys.argv[1]

	    # A non-numeric threshold would otherwise abort with a raw traceback.
	    try:
	        char_threshold = int(sys.argv[2])
	    except ValueError:
	        print(
	            f"Invalid character threshold: {sys.argv[2]!r} (expected an integer)",
	            file=sys.stderr,
	        )
	        sys.exit(1)

	    # generate_pdf_list raises ValueError when the range cannot be parsed.
	    try:
	        input_pdfs = generate_pdf_list(range_param)
	    except ValueError as exc:
	        print(f"Invalid range {range_param!r}: {exc}", file=sys.stderr)
	        sys.exit(1)

	    result = process_pdfs(input_pdfs, char_threshold)

	    # process_pdfs returns a list of hits, or a message string when empty.
	    if isinstance(result, list):
	        for pdf, count in result:
	            print(f"PDF: {pdf} has {count} characters.")
	    else:
	        print(result)