DonRichards · November 19, 2024 17:52
diff --git a/README.md b/README.md
diff --git a/Dockerfile b/Dockerfile
 # Use an official Python runtime as a parent image
 FROM python:3.10-slim

 # Emit Python output immediately instead of buffering it, so logs show up in `docker logs`
 ENV PYTHONUNBUFFERED=1

 # Install the OCR/PDF system libraries and the Python packages the script imports.
 # The apt package lists are removed in the same layer so they do not bloat the image.
 RUN apt-get update && \
    apt-get install -y --no-install-recommends \
    tesseract-ocr \
    libtesseract-dev \
    poppler-utils \
    libjpeg-dev \
    libopenjp2-7-dev \
    libfreetype6-dev \
    liblcms2-dev \
    libharfbuzz-dev \
    libfribidi-dev \
    libxcb1-dev \
    ghostscript && \
    pip install --no-cache-dir \
    pytesseract \
    PyMuPDF \
    pikepdf \
    Pillow \
    requests && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

 # Create a non-root user and an application directory owned by that user.
 # Relying on WORKDIR alone to create the directory can leave it owned by root
 # (this depends on the builder), which would stop the script from creating its
 # input/ and output/ folders at runtime.
 RUN useradd -ms /bin/bash appuser && \
    mkdir -p /home/appuser/app && \
    chown appuser:appuser /home/appuser/app
 USER appuser

 # Set the working directory (already created above with the right owner)
 WORKDIR /home/appuser/app

 # Copy the current directory contents into the container at /home/appuser/app
 COPY --chown=appuser:appuser . .

 # Set the CMD to launch a bash shell in the working directory
 CMD ["bash"]
diff --git a/process_pdf_for_compliance.py b/process_pdf_for_compliance.py
 import os
 import sys
 import logging
 import fitz  # PyMuPDF
 import pikepdf
 from pikepdf import Pdf, Name, Dictionary, Array
 import io
 from PIL import Image
 import pytesseract
 from pytesseract import Output
 import requests

 # Configure the root logger once at import time: INFO and above, with every
 # record prefixed by its timestamp and level name.
 logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
 )

 def download_file(url, output_path, timeout=30):
    """Download ``url`` and store the response body at ``output_path``.

    Args:
        url: Address to fetch; redirects are followed.
        output_path: File path the downloaded bytes are written to.
        timeout: Value passed to ``requests.get`` as its ``timeout`` argument
            (seconds). Without it a stalled server could block the script
            indefinitely.

    Returns:
        ``output_path`` once the file has been written.

    Any failure (request error, HTTP error status, or a problem writing the
    file) is logged and ends the program via ``sys.exit(1)``.
    """
    try:
        response = requests.get(url, allow_redirects=True, timeout=timeout)
        response.raise_for_status()  # Turn 4xx/5xx responses into exceptions
        with open(output_path, 'wb') as file:
            file.write(response.content)
        return output_path
    except Exception as e:
        # Script-level boundary: report the problem and stop instead of
        # continuing with a missing input file.
        logging.error(f"Failed to download the file: {e}")
        sys.exit(1)

 def check_file_exists(file_path):
    """Exit the program with status 1 unless ``file_path`` is an existing regular file.

    Returns ``None`` when the file exists; otherwise logs an error and calls
    ``sys.exit(1)``.
    """
    if os.path.isfile(file_path):
        return
    logging.error(f"File does not exist: {file_path}")
    sys.exit(1)

 def analyze_pdf_structure(pdf_path):
    """Log the page count of the PDF and the extracted-text length of every page.

    Args:
        pdf_path: Path of the PDF to open with ``fitz.open``.

    Nothing is returned; the findings are only written to the log. Errors from
    opening or reading the document propagate to the caller.
    """
    doc = fitz.open(pdf_path)
    try:
        logging.info(f"PDF Analysis for {pdf_path}:")
        logging.info(f"Number of pages: {len(doc)}")
        for page_num, page in enumerate(doc, start=1):
            logging.info(f"Page {page_num}:")
            logging.info(f"  Text length: {len(page.get_text())}")
    finally:
        # Release the document even if reading a page fails.
        doc.close()

 def process_page(page):
    """Run OCR on one page and return the recognized words as a single string.

    The page is rendered to a pixmap using ``fitz.Matrix(2, 2)``, wrapped in a
    PIL RGB image and handed to ``pytesseract.image_to_data`` with the
    ``--psm 6`` config. All non-blank entries of the result's ``'text'`` list
    are joined with single spaces.

    Returns:
        The joined text, or ``""`` if anything raised (the error is logged).
    """
    try:
        scale = fitz.Matrix(2, 2)
        pixmap = page.get_pixmap(matrix=scale)
        image = Image.frombytes("RGB", [pixmap.width, pixmap.height], pixmap.samples)
        ocr_data = pytesseract.image_to_data(image, output_type=Output.DICT, config='--psm 6')
        # Skip empty / whitespace-only entries in the OCR output.
        words = [token for token in ocr_data['text'] if token.strip()]
        return " ".join(words)
    except Exception as e:
        logging.error(f"Error processing page: {e}")
        return ""

 def add_accessibility_features(pdf_path, output_directory):
    """OCR every page, embed the recognized text, and save a ``*_Service_File.pdf`` copy.

    For each page the text returned by ``process_page`` (if any) is inserted at
    position (10, 10) with ``fontsize=1`` and ``color=(1, 1, 1)`` (presumably so
    the added text stays visually unobtrusive). A failure on one page is logged
    and does not stop the remaining pages.

    Args:
        pdf_path: Source PDF to open.
        output_directory: Directory that receives the processed copy.

    Returns:
        Path of the saved file, ``<output_directory>/<stem>_Service_File.pdf``.
    """
    # Derive the name from the extension-less stem. A plain
    # str.replace('.pdf', ...) would rewrite every ".pdf" inside the name and
    # would do nothing at all for ".PDF" or extension-less inputs.
    stem, _extension = os.path.splitext(os.path.basename(pdf_path))
    output_path = os.path.join(output_directory, f"{stem}_Service_File.pdf")

    doc = fitz.open(pdf_path)
    try:
        for page_num, page in enumerate(doc, start=1):
            try:
                ocr_text = process_page(page)
                if ocr_text:
                    page.insert_text((10, 10), ocr_text, fontsize=1, color=(1, 1, 1))
            except Exception as e:
                logging.error(f"Error processing page {page_num}: {e}")

        # Save the modified PDF in the output directory
        doc.save(output_path, garbage=4, deflate=True)
    finally:
        # Always release the document, including when saving fails.
        doc.close()
    logging.info(f"Saved Service_File PDF as: {output_path}")
    return output_path

 def add_document_structure(pdf_path, title="Document Title", creator=("Author Name",),
                            description="Description of the document content",
                            lang='en-US'):
    """Add a structure-tree root, a language entry and basic metadata to a PDF in place.

    Args:
        pdf_path: PDF to modify; the file is overwritten with the result.
        title: Value written to the ``dc:title`` metadata field.
        creator: Author name, or an iterable of names, for ``dc:creator``.
        description: Value written to ``dc:description``.
        lang: Written to the catalog's ``/Lang`` only if none is present.

    The defaults are the placeholder values this function has always written;
    pass real values to avoid stamping placeholders into the document.

    Errors are logged with their traceback and not re-raised, so callers keep
    running even when tagging fails.
    """
    creators = [creator] if isinstance(creator, str) else list(creator)
    try:
        with pikepdf.Pdf.open(pdf_path, allow_overwriting_input=True) as pdf:
            if "/StructTreeRoot" not in pdf.Root:
                pdf.Root["/StructTreeRoot"] = pikepdf.Dictionary({
                    # /Type must be a PDF name object; a plain Python str
                    # would be stored as a PDF string instead.
                    "/Type": pikepdf.Name("/StructTreeRoot"),
                    "/K": pikepdf.Array(),
                    "/ParentTree": pikepdf.Dictionary({"/Nums": pikepdf.Array()}),
                    "/ParentTreeNextKey": 0
                })

            if "/Lang" not in pdf.Root:
                pdf.Root["/Lang"] = lang

            with pdf.open_metadata() as meta:
                meta["dc:title"] = title
                meta["dc:creator"] = creators  # must be a list of strings
                meta["dc:description"] = description

            pdf.save()
        logging.info(f"Added document structure and tags to: {pdf_path}")
    except Exception as e:
        # logging.exception records the message together with the exception
        # type and full traceback in a single entry.
        logging.exception(f"Error adding document structure: {e}")

 def create_thumbnail(pdf_path, output_directory, size=(220, 220)):
    """Render the first page of a PDF and save it as a JPEG thumbnail.

    Args:
        pdf_path: PDF whose first page is rendered.
        output_directory: Directory that receives the thumbnail.
        size: Maximum ``(width, height)`` passed to ``Image.thumbnail``.

    Returns:
        Path of the saved ``<stem>_thumbnail.jpg``, or ``None`` if anything
        failed (the error is logged).
    """
    doc = None
    try:
        doc = fitz.open(pdf_path)
        page = doc.load_page(0)  # Load the first page
        pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))
        img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)

        # Resize the image to the specified size
        img.thumbnail(size)

        # Strip the "_Service_File.pdf" suffix when present; otherwise fall
        # back to dropping the extension. A plain str.replace would leave the
        # name unchanged for other inputs, and the JPEG could then be written
        # over the PDF itself.
        base_name = os.path.basename(pdf_path)
        service_suffix = '_Service_File.pdf'
        if base_name.endswith(service_suffix):
            stem = base_name[:-len(service_suffix)]
        else:
            stem = os.path.splitext(base_name)[0]
        thumbnail_path = os.path.join(output_directory, stem + '_thumbnail.jpg')

        img.save(thumbnail_path, "JPEG", quality=85, optimize=True)
        logging.info(f"Thumbnail created and saved as: {thumbnail_path}")
        return thumbnail_path
    except Exception as e:
        logging.error(f"Error creating thumbnail: {e}")
        return None
    finally:
        # Close the document on success and on failure alike.
        if doc is not None:
            doc.close()

 def make_pdf_accessible(pdf_path, output_directory, thumbnail_size=(220, 220)):
    """Run the full pipeline for one PDF: analyze, OCR, tag, and create a thumbnail.

    Args:
        pdf_path: PDF to process.
        output_directory: Directory for the processed PDF and the thumbnail.
        thumbnail_size: Forwarded to ``create_thumbnail`` as ``size``.

    Nothing is returned. Any exception raised by a step is logged and
    swallowed here.
    """
    try:
        analyze_pdf_structure(pdf_path)
        service_file = add_accessibility_features(pdf_path, output_directory)
        add_document_structure(service_file)
        thumbnail = create_thumbnail(service_file, output_directory, size=thumbnail_size)
        if not thumbnail:
            logging.info(f"PDF '{pdf_path}' has been processed for accessibility, but thumbnail creation failed.")
        else:
            logging.info(f"PDF '{pdf_path}' has been processed for accessibility and thumbnail created.")
    except Exception as e:
        logging.error(f"Error processing PDF: {e}")

 if __name__ == "__main__":
    # Command-line entry point:
    #   python process_pdf_for_compliance.py <path_to_pdf_or_url> [thumbnail_size]
    # Local inputs are looked up by file name inside ./input; results go to ./output.

    # Only this entry point needs URL parsing, so the import stays local.
    from urllib.parse import unquote, urlparse

    if len(sys.argv) not in (2, 3):
        print("Usage: python process_pdf_for_compliance.py <path_to_pdf_or_url> [thumbnail_size]")
        sys.exit(1)

    input_source = sys.argv[1]
    is_url = input_source.startswith(("http://", "https://"))

    # Define input and output directories
    input_directory = "input"
    output_directory = "output"

    # Ensure input and output directories exist (exist_ok avoids a check-then-create race)
    os.makedirs(input_directory, exist_ok=True)
    os.makedirs(output_directory, exist_ok=True)

    # Handle input source (URL or local file)
    if is_url:
        logging.info(f"Downloading file from URL: {input_source}")
        # Name the local copy after the URL *path* only, so query strings and
        # fragments ("file.pdf?token=...") do not end up in the file name.
        url_filename = os.path.basename(unquote(urlparse(input_source).path))
        if not url_filename:
            logging.error(f"Could not determine a file name from URL: {input_source}")
            sys.exit(1)
        input_file = os.path.join(input_directory, url_filename)
        file_to_process = download_file(input_source, input_file)
    else:
        # Assume it's a local file path in the input directory
        input_file = os.path.join(input_directory, os.path.basename(input_source))
        if not os.path.exists(input_file):
            logging.error(f"File does not exist in input directory: {input_file}")
            sys.exit(1)
        file_to_process = input_file

    # Determine the thumbnail size; anything that is not a positive integer
    # falls back to the default.
    thumbnail_size = (220, 220)
    if len(sys.argv) == 3:
        try:
            size_value = int(sys.argv[2])
        except ValueError:
            size_value = 0
        if size_value > 0:
            thumbnail_size = (size_value, size_value)
        else:
            logging.warning("Invalid thumbnail size provided. Using default size 220x220.")

    try:
        # Process the PDF for accessibility and create the thumbnail
        make_pdf_accessible(file_to_process, output_directory, thumbnail_size=thumbnail_size)
    finally:
        # Remove a downloaded input even when processing raised.
        if is_url and os.path.exists(input_file):
            logging.info(f"Cleaning up downloaded file: {input_file}")
            os.remove(input_file)
	# Use an official Python runtime as a parent image
	FROM python:3.10-slim

	# Emit Python output immediately instead of buffering it, so logs show up in `docker logs`
	ENV PYTHONUNBUFFERED=1

	# Install the OCR/PDF system libraries and the Python packages the script imports.
	# The apt package lists are removed in the same layer so they do not bloat the image.
	RUN apt-get update && \
	    apt-get install -y --no-install-recommends \
	    tesseract-ocr \
	    libtesseract-dev \
	    poppler-utils \
	    libjpeg-dev \
	    libopenjp2-7-dev \
	    libfreetype6-dev \
	    liblcms2-dev \
	    libharfbuzz-dev \
	    libfribidi-dev \
	    libxcb1-dev \
	    ghostscript && \
	    pip install --no-cache-dir \
	    pytesseract \
	    PyMuPDF \
	    pikepdf \
	    Pillow \
	    requests && \
	    apt-get clean && \
	    rm -rf /var/lib/apt/lists/*

	# Create a non-root user and an application directory owned by that user.
	# Relying on WORKDIR alone to create the directory can leave it owned by root
	# (this depends on the builder), which would stop the script from creating its
	# input/ and output/ folders at runtime.
	RUN useradd -ms /bin/bash appuser && \
	    mkdir -p /home/appuser/app && \
	    chown appuser:appuser /home/appuser/app
	USER appuser

	# Set the working directory (already created above with the right owner)
	WORKDIR /home/appuser/app

	# Copy the current directory contents into the container at /home/appuser/app
	COPY --chown=appuser:appuser . .

	# Set the CMD to launch a bash shell in the working directory
	CMD ["bash"]
	import os
	import sys
	import logging
	import fitz # PyMuPDF
	import pikepdf
	from pikepdf import Pdf, Name, Dictionary, Array
	import io
	from PIL import Image
	import pytesseract
	from pytesseract import Output
	import requests

	# Configure the root logger once at import time: INFO and above, with every
	# record prefixed by its timestamp and level name.
	logging.basicConfig(
	    format='%(asctime)s - %(levelname)s - %(message)s',
	    level=logging.INFO,
	)

	def download_file(url, output_path, timeout=30):
	    """Download ``url`` and store the response body at ``output_path``.

	    Args:
	        url: Address to fetch; redirects are followed.
	        output_path: File path the downloaded bytes are written to.
	        timeout: Value passed to ``requests.get`` as its ``timeout`` argument
	            (seconds). Without it a stalled server could block the script
	            indefinitely.

	    Returns:
	        ``output_path`` once the file has been written.

	    Any failure (request error, HTTP error status, or a problem writing the
	    file) is logged and ends the program via ``sys.exit(1)``.
	    """
	    try:
	        response = requests.get(url, allow_redirects=True, timeout=timeout)
	        response.raise_for_status()  # Turn 4xx/5xx responses into exceptions
	        with open(output_path, 'wb') as file:
	            file.write(response.content)
	        return output_path
	    except Exception as e:
	        # Script-level boundary: report the problem and stop instead of
	        # continuing with a missing input file.
	        logging.error(f"Failed to download the file: {e}")
	        sys.exit(1)

	def check_file_exists(file_path):
	    """Exit the program with status 1 unless ``file_path`` is an existing regular file.

	    Returns ``None`` when the file exists; otherwise logs an error and calls
	    ``sys.exit(1)``.
	    """
	    if os.path.isfile(file_path):
	        return
	    logging.error(f"File does not exist: {file_path}")
	    sys.exit(1)

	def analyze_pdf_structure(pdf_path):
	    """Log the page count of the PDF and the extracted-text length of every page.

	    Args:
	        pdf_path: Path of the PDF to open with ``fitz.open``.

	    Nothing is returned; the findings are only written to the log. Errors from
	    opening or reading the document propagate to the caller.
	    """
	    doc = fitz.open(pdf_path)
	    try:
	        logging.info(f"PDF Analysis for {pdf_path}:")
	        logging.info(f"Number of pages: {len(doc)}")
	        for page_num, page in enumerate(doc, start=1):
	            logging.info(f"Page {page_num}:")
	            logging.info(f" Text length: {len(page.get_text())}")
	    finally:
	        # Release the document even if reading a page fails.
	        doc.close()

	def process_page(page):
	    """Run OCR on one page and return the recognized words as a single string.

	    The page is rendered to a pixmap using ``fitz.Matrix(2, 2)``, wrapped in a
	    PIL RGB image and handed to ``pytesseract.image_to_data`` with the
	    ``--psm 6`` config. All non-blank entries of the result's ``'text'`` list
	    are joined with single spaces.

	    Returns:
	        The joined text, or ``""`` if anything raised (the error is logged).
	    """
	    try:
	        scale = fitz.Matrix(2, 2)
	        pixmap = page.get_pixmap(matrix=scale)
	        image = Image.frombytes("RGB", [pixmap.width, pixmap.height], pixmap.samples)
	        ocr_data = pytesseract.image_to_data(image, output_type=Output.DICT, config='--psm 6')
	        # Skip empty / whitespace-only entries in the OCR output.
	        words = [token for token in ocr_data['text'] if token.strip()]
	        return " ".join(words)
	    except Exception as e:
	        logging.error(f"Error processing page: {e}")
	        return ""

	def add_accessibility_features(pdf_path, output_directory):
	    """OCR every page, embed the recognized text, and save a ``*_Service_File.pdf`` copy.

	    For each page the text returned by ``process_page`` (if any) is inserted at
	    position (10, 10) with ``fontsize=1`` and ``color=(1, 1, 1)`` (presumably so
	    the added text stays visually unobtrusive). A failure on one page is logged
	    and does not stop the remaining pages.

	    Args:
	        pdf_path: Source PDF to open.
	        output_directory: Directory that receives the processed copy.

	    Returns:
	        Path of the saved file, ``<output_directory>/<stem>_Service_File.pdf``.
	    """
	    # Derive the name from the extension-less stem. A plain
	    # str.replace('.pdf', ...) would rewrite every ".pdf" inside the name and
	    # would do nothing at all for ".PDF" or extension-less inputs.
	    stem, _extension = os.path.splitext(os.path.basename(pdf_path))
	    output_path = os.path.join(output_directory, f"{stem}_Service_File.pdf")

	    doc = fitz.open(pdf_path)
	    try:
	        for page_num, page in enumerate(doc, start=1):
	            try:
	                ocr_text = process_page(page)
	                if ocr_text:
	                    page.insert_text((10, 10), ocr_text, fontsize=1, color=(1, 1, 1))
	            except Exception as e:
	                logging.error(f"Error processing page {page_num}: {e}")

	        # Save the modified PDF in the output directory
	        doc.save(output_path, garbage=4, deflate=True)
	    finally:
	        # Always release the document, including when saving fails.
	        doc.close()
	    logging.info(f"Saved Service_File PDF as: {output_path}")
	    return output_path

	def add_document_structure(pdf_path, title="Document Title", creator=("Author Name",),
	                           description="Description of the document content",
	                           lang='en-US'):
	    """Add a structure-tree root, a language entry and basic metadata to a PDF in place.

	    Args:
	        pdf_path: PDF to modify; the file is overwritten with the result.
	        title: Value written to the ``dc:title`` metadata field.
	        creator: Author name, or an iterable of names, for ``dc:creator``.
	        description: Value written to ``dc:description``.
	        lang: Written to the catalog's ``/Lang`` only if none is present.

	    The defaults are the placeholder values this function has always written;
	    pass real values to avoid stamping placeholders into the document.

	    Errors are logged with their traceback and not re-raised, so callers keep
	    running even when tagging fails.
	    """
	    creators = [creator] if isinstance(creator, str) else list(creator)
	    try:
	        with pikepdf.Pdf.open(pdf_path, allow_overwriting_input=True) as pdf:
	            if "/StructTreeRoot" not in pdf.Root:
	                pdf.Root["/StructTreeRoot"] = pikepdf.Dictionary({
	                    # /Type must be a PDF name object; a plain Python str
	                    # would be stored as a PDF string instead.
	                    "/Type": pikepdf.Name("/StructTreeRoot"),
	                    "/K": pikepdf.Array(),
	                    "/ParentTree": pikepdf.Dictionary({"/Nums": pikepdf.Array()}),
	                    "/ParentTreeNextKey": 0
	                })

	            if "/Lang" not in pdf.Root:
	                pdf.Root["/Lang"] = lang

	            with pdf.open_metadata() as meta:
	                meta["dc:title"] = title
	                meta["dc:creator"] = creators  # must be a list of strings
	                meta["dc:description"] = description

	            pdf.save()
	        logging.info(f"Added document structure and tags to: {pdf_path}")
	    except Exception as e:
	        # logging.exception records the message together with the exception
	        # type and full traceback in a single entry.
	        logging.exception(f"Error adding document structure: {e}")

	def create_thumbnail(pdf_path, output_directory, size=(220, 220)):
	    """Render the first page of a PDF and save it as a JPEG thumbnail.

	    Args:
	        pdf_path: PDF whose first page is rendered.
	        output_directory: Directory that receives the thumbnail.
	        size: Maximum ``(width, height)`` passed to ``Image.thumbnail``.

	    Returns:
	        Path of the saved ``<stem>_thumbnail.jpg``, or ``None`` if anything
	        failed (the error is logged).
	    """
	    doc = None
	    try:
	        doc = fitz.open(pdf_path)
	        page = doc.load_page(0)  # Load the first page
	        pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))
	        img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)

	        # Resize the image to the specified size
	        img.thumbnail(size)

	        # Strip the "_Service_File.pdf" suffix when present; otherwise fall
	        # back to dropping the extension. A plain str.replace would leave the
	        # name unchanged for other inputs, and the JPEG could then be written
	        # over the PDF itself.
	        base_name = os.path.basename(pdf_path)
	        service_suffix = '_Service_File.pdf'
	        if base_name.endswith(service_suffix):
	            stem = base_name[:-len(service_suffix)]
	        else:
	            stem = os.path.splitext(base_name)[0]
	        thumbnail_path = os.path.join(output_directory, stem + '_thumbnail.jpg')

	        img.save(thumbnail_path, "JPEG", quality=85, optimize=True)
	        logging.info(f"Thumbnail created and saved as: {thumbnail_path}")
	        return thumbnail_path
	    except Exception as e:
	        logging.error(f"Error creating thumbnail: {e}")
	        return None
	    finally:
	        # Close the document on success and on failure alike.
	        if doc is not None:
	            doc.close()

	def make_pdf_accessible(pdf_path, output_directory, thumbnail_size=(220, 220)):
	    """Run the full pipeline for one PDF: analyze, OCR, tag, and create a thumbnail.

	    Args:
	        pdf_path: PDF to process.
	        output_directory: Directory for the processed PDF and the thumbnail.
	        thumbnail_size: Forwarded to ``create_thumbnail`` as ``size``.

	    Nothing is returned. Any exception raised by a step is logged and
	    swallowed here.
	    """
	    try:
	        analyze_pdf_structure(pdf_path)
	        service_file = add_accessibility_features(pdf_path, output_directory)
	        add_document_structure(service_file)
	        thumbnail = create_thumbnail(service_file, output_directory, size=thumbnail_size)
	        if not thumbnail:
	            logging.info(f"PDF '{pdf_path}' has been processed for accessibility, but thumbnail creation failed.")
	        else:
	            logging.info(f"PDF '{pdf_path}' has been processed for accessibility and thumbnail created.")
	    except Exception as e:
	        logging.error(f"Error processing PDF: {e}")

	if __name__ == "__main__":
	    # Command-line entry point:
	    #   python process_pdf_for_compliance.py <path_to_pdf_or_url> [thumbnail_size]
	    # Local inputs are looked up by file name inside ./input; results go to ./output.

	    # Only this entry point needs URL parsing, so the import stays local.
	    from urllib.parse import unquote, urlparse

	    if len(sys.argv) not in (2, 3):
	        print("Usage: python process_pdf_for_compliance.py <path_to_pdf_or_url> [thumbnail_size]")
	        sys.exit(1)

	    input_source = sys.argv[1]
	    is_url = input_source.startswith(("http://", "https://"))

	    # Define input and output directories
	    input_directory = "input"
	    output_directory = "output"

	    # Ensure input and output directories exist (exist_ok avoids a check-then-create race)
	    os.makedirs(input_directory, exist_ok=True)
	    os.makedirs(output_directory, exist_ok=True)

	    # Handle input source (URL or local file)
	    if is_url:
	        logging.info(f"Downloading file from URL: {input_source}")
	        # Name the local copy after the URL *path* only, so query strings and
	        # fragments ("file.pdf?token=...") do not end up in the file name.
	        url_filename = os.path.basename(unquote(urlparse(input_source).path))
	        if not url_filename:
	            logging.error(f"Could not determine a file name from URL: {input_source}")
	            sys.exit(1)
	        input_file = os.path.join(input_directory, url_filename)
	        file_to_process = download_file(input_source, input_file)
	    else:
	        # Assume it's a local file path in the input directory
	        input_file = os.path.join(input_directory, os.path.basename(input_source))
	        if not os.path.exists(input_file):
	            logging.error(f"File does not exist in input directory: {input_file}")
	            sys.exit(1)
	        file_to_process = input_file

	    # Determine the thumbnail size; anything that is not a positive integer
	    # falls back to the default.
	    thumbnail_size = (220, 220)
	    if len(sys.argv) == 3:
	        try:
	            size_value = int(sys.argv[2])
	        except ValueError:
	            size_value = 0
	        if size_value > 0:
	            thumbnail_size = (size_value, size_value)
	        else:
	            logging.warning("Invalid thumbnail size provided. Using default size 220x220.")

	    try:
	        # Process the PDF for accessibility and create the thumbnail
	        make_pdf_accessible(file_to_process, output_directory, thumbnail_size=thumbnail_size)
	    finally:
	        # Remove a downloaded input even when processing raised.
	        if is_url and os.path.exists(input_file):
	            logging.info(f"Cleaning up downloaded file: {input_file}")
	            os.remove(input_file)