7effrey89 · July 9, 2025 08:26
diff --git a/app.py b/app.py
 # Script to create searchable PDF from scan PDF or images using Azure Form Recognizer / Document Intelligence
 #
 # USAGE:
 #   python app.py <input_file> [-o <output_file>]
 #
 # ARGUMENTS:
 #   <input_file>   Path to your PDF or image file (jpg, jpeg, tif, tiff, bmp, png).
 #   -o <output_file> (optional) Name for the output searchable PDF.
 #                    If omitted, output will be <input_file>.ocr.pdf
 #
 # EXAMPLES:
 #   python app.py myscan.pdf
 #   python app.py myscan.pdf -o mysearchable.pdf
 #
 # DEPENDENCIES:
 #   pip install --upgrade "azure-ai-formrecognizer>=3.3" "pypdf>=3.0" reportlab pillow pdf2image
 #
 #   You also need Poppler for Windows for pdf2image:
 #     - Download from: https://github.com/oschwartz10612/poppler-windows/releases/
 #     - Extract and update the POPPLER_PATH variable in this script if needed.
 #
 # AZURE CREDENTIALS:
 #   Update the 'endpoint' and 'key' variables in this script with your Azure Form Recognizer / Document Intelligence resource values.
 #
 # OUTPUT:
 #   The script creates a searchable PDF in the same directory as your input file (unless you specify a different output path).
 #   The output file will be named <input_file>.ocr.pdf by default.
 #
 # REFERENCE:
 #   https://techcommunity.microsoft.com/blog/azure-ai-services-blog/generate-searchable-pdfs-with-azure-form-recognizer/3652024
 #
 # ------------------------------------------------------------------------------

 # Script to create searchable PDF from scan PDF or images using Azure Form Recognizer / Document Intelligence
 # Required packages
 # pip install --upgrade "azure-ai-formrecognizer>=3.3" "pypdf>=3.0" reportlab pillow pdf2image
 #https://techcommunity.microsoft.com/blog/azure-ai-services-blog/generate-searchable-pdfs-with-azure-form-recognizer/3652024
 import sys
 import io
 import math
 import argparse
 from pdf2image import convert_from_path
 from reportlab.pdfgen import canvas
 from reportlab.lib import pagesizes
 from reportlab import rl_config
 from PIL import Image, ImageSequence
 from pypdf import PdfWriter, PdfReader
 from azure.core.credentials import AzureKeyCredential
 from azure.ai.formrecognizer import DocumentAnalysisClient

 import os

 # Azure Form Recognizer / Document Intelligence endpoint and key.
 # Each value is taken from the named environment variable when it is set, so
 # real credentials do not have to be saved in this script; otherwise the
 # placeholder below is used and must be replaced by hand.
 endpoint = os.environ.get("AZURE_FORM_RECOGNIZER_ENDPOINT", "https://<aifoundry-resource>.cognitiveservices.azure.com/")
 key = os.environ.get("AZURE_FORM_RECOGNIZER_KEY", "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxux")

 # Poppler "bin" directory handed to pdf2image when rasterizing PDFs.
 # Windows builds: https://github.com/oschwartz10612/poppler-windows/releases/
 # Set the POPPLER_PATH environment variable to override the default below.
 POPPLER_PATH = os.environ.get("POPPLER_PATH", r"C:\poppler-24.08.0\Library\bin")


 def dist(p1, p2):
    """Return the Euclidean distance between two points.

    Both arguments only need numeric ``x`` and ``y`` attributes.
    ``math.hypot`` replaces the hand-written square root of squared
    differences so that very large coordinate differences do not overflow
    to infinity while being squared.
    """
    return math.hypot(p1.x - p2.x, p1.y - p2.y)

 def _load_image_pages(input_file):
    """Return the pages of input_file as PIL images, indexable by page number.

    PDFs are rasterized with pdf2image, using the Poppler binaries found at
    POPPLER_PATH. Image files are wrapped in an ImageSequence.Iterator so a
    multi-page TIFF can be addressed page by page. The process exits with an
    error message when the file extension is not supported.
    """
    lowered = input_file.lower()
    if lowered.endswith('.pdf'):
        return convert_from_path(input_file, poppler_path=POPPLER_PATH)
    if lowered.endswith(('.tif', '.tiff', '.jpg', '.jpeg', '.png', '.bmp')):
        return ImageSequence.Iterator(Image.open(input_file))
    sys.exit(f"Error: Unsupported input file extension {input_file}. Supported extensions: PDF, TIF, TIFF, JPG, JPEG, PNG, BMP.")


 def _add_invisible_words(pdf_canvas, words, scale, page_height, font_name):
    """Draw the OCR words onto pdf_canvas as invisible text.

    scale converts OCR page units into canvas units. page_height is the
    canvas page height, used to flip the Y axis because the anchor is
    computed as ``page_height - y``. Words without a four-point polygon, or
    whose rendered width is zero, are skipped because they cannot be
    positioned or scaled.
    """
    text = pdf_canvas.beginText()
    # Text render mode 3 draws neither fill nor stroke, i.e. invisible text.
    text.setTextRenderMode(3)
    for word in words or ():
        polygon = word.polygon
        if not polygon or len(polygon) < 4:
            continue
        # NOTE(review): corners are presumably clockwise from the word's
        # top-left (p0) to its bottom-left (p3) - confirm against the Azure docs.
        p0, p1, p2, p3 = polygon[:4]

        # The font size follows the box height; the width is matched
        # afterwards through horizontal scaling.
        desired_text_width = max(dist(p0, p1), dist(p3, p2)) * scale
        desired_text_height = max(dist(p1, p2), dist(p0, p3)) * scale
        font_size = desired_text_height
        actual_text_width = pdf_canvas.stringWidth(word.content, font_name, font_size)
        if actual_text_width <= 0:
            # Empty content or a zero-height box: the horizontal scale below
            # would divide by zero.
            continue

        # Rotation angle: average direction of the p0->p1 and p3->p2 edges.
        text_angle = math.atan2((p1.y - p0.y + p2.y - p3.y) / 2.0,
                                (p1.x - p0.x + p2.x - p3.x) / 2.0)
        cos_a = math.cos(text_angle)
        sin_a = math.sin(text_angle)
        text.setFont(font_name, font_size)
        text.setTextTransform(cos_a, -sin_a, sin_a, cos_a, p3.x * scale, page_height - p3.y * scale)
        text.setHorizScale(desired_text_width / actual_text_width * 100)
        text.textOut(word.content + " ")

    pdf_canvas.drawText(text)


 def main():
    """Create a searchable PDF from the scanned PDF or image named on the command line.

    Steps: parse arguments, load the input as page images, run the Azure
    "prebuilt-read" model on the original file, then build one PDF page per
    OCR page with the page image plus an invisible text layer, and write the
    result to the output file.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('input_file', type=str, help="Input PDF or image (jpg, jpeg, tif, tiff, bmp, png) file name")
    parser.add_argument('-o', '--output', type=str, required=False, default="", help="Output PDF file name. Default: input_file + .ocr.pdf")
    args = parser.parse_args()

    input_file = args.input_file
    output_file = args.output or input_file + ".ocr.pdf"

    # Loading input file
    print(f"Loading input file {input_file}")
    image_pages = _load_image_pages(input_file)

    # Running OCR using Azure Form Recognizer Read API
    print("Starting Azure Form Recognizer OCR process...")
    document_analysis_client = DocumentAnalysisClient(endpoint=endpoint, credential=AzureKeyCredential(key), headers={"x-ms-useragent": "searchable-pdf-blog/1.0.0"})

    with open(input_file, "rb") as f:
        poller = document_analysis_client.begin_analyze_document("prebuilt-read", document=f)

    ocr_results = poller.result()
    print(f"Azure Form Recognizer finished OCR text for {len(ocr_results.pages)} pages.")

    # Generate OCR overlay layer
    print("Generating searchable PDF...")
    output = PdfWriter()
    default_font = "Times-Roman"
    for page_id, page in enumerate(ocr_results.pages):
        ocr_overlay = io.BytesIO()
        # Look the page image up once per page instead of on every access.
        img = image_pages[page_id]

        # Overlay page size: the longer image side is mapped to the letter
        # page height, and the other side keeps the aspect ratio.
        page_scale = float(max(img.height, img.width)) / pagesizes.letter[1]
        page_width = float(img.width) / page_scale
        page_height = float(img.height) / page_scale

        # Average of the horizontal and vertical OCR-unit -> canvas-unit ratios.
        scale = (page_width / page.width + page_height / page.height) / 2.0
        pdf_canvas = canvas.Canvas(ocr_overlay, pagesize=(page_width, page_height))

        # Re-encode the page image as JPEG in memory before embedding it.
        if img.mode != 'RGB':
            img = img.convert('RGB')
        img_bytes = io.BytesIO()
        img.save(img_bytes, format='JPEG', quality=70, optimize=True)
        img_bytes.seek(0)
        compressed_img = Image.open(img_bytes)
        pdf_canvas.drawInlineImage(compressed_img, 0, 0, width=page_width, height=page_height, preserveAspectRatio=True)

        _add_invisible_words(pdf_canvas, page.words, scale, page_height, default_font)
        pdf_canvas.save()

        # Rewind the buffer so it can be read back as a one-page PDF.
        ocr_overlay.seek(0)
        output.add_page(PdfReader(ocr_overlay).pages[0])

    # Save output searchable PDF file
    with open(output_file, "wb") as output_stream:
        output.write(output_stream)

    print(f"Searchable PDF is created: {output_file}")


 if __name__ == '__main__':
    main()
	# Script to create searchable PDF from scan PDF or images using Azure Form Recognizer / Document Intelligence
	#
	# USAGE:
	# python app.py <input_file> [-o <output_file>]
	#
	# ARGUMENTS:
	# <input_file> Path to your PDF or image file (jpg, jpeg, tif, tiff, bmp, png).
	# -o <output_file> (optional) Name for the output searchable PDF.
	# If omitted, output will be <input_file>.ocr.pdf
	#
	# EXAMPLES:
	# python app.py myscan.pdf
	# python app.py myscan.pdf -o mysearchable.pdf
	#
	# DEPENDENCIES:
	# pip install --upgrade "azure-ai-formrecognizer>=3.3" "pypdf>=3.0" reportlab pillow pdf2image
	#
	# You also need Poppler for Windows for pdf2image:
	# - Download from: https://github.com/oschwartz10612/poppler-windows/releases/
	# - Extract and update the POPPLER_PATH variable in this script if needed.
	#
	# AZURE CREDENTIALS:
	# Update the 'endpoint' and 'key' variables in this script with your Azure Form Recognizer / Document Intelligence resource values.
	#
	# OUTPUT:
	# The script creates a searchable PDF in the same directory as your input file (unless you specify a different output path).
	# The output file will be named <input_file>.ocr.pdf by default.
	#
	# REFERENCE:
	# https://techcommunity.microsoft.com/blog/azure-ai-services-blog/generate-searchable-pdfs-with-azure-form-recognizer/3652024
	#
	# ------------------------------------------------------------------------------

	# Script to create searchable PDF from scan PDF or images using Azure Form Recognizer / Document Intelligence
	# Required packages
	# pip install --upgrade "azure-ai-formrecognizer>=3.3" "pypdf>=3.0" reportlab pillow pdf2image
	#https://techcommunity.microsoft.com/blog/azure-ai-services-blog/generate-searchable-pdfs-with-azure-form-recognizer/3652024
	import sys
	import io
	import math
	import argparse
	from pdf2image import convert_from_path
	from reportlab.pdfgen import canvas
	from reportlab.lib import pagesizes
	from reportlab import rl_config
	from PIL import Image, ImageSequence
	from pypdf import PdfWriter, PdfReader
	from azure.core.credentials import AzureKeyCredential
	from azure.ai.formrecognizer import DocumentAnalysisClient

	import os

	# Azure Form Recognizer / Document Intelligence endpoint and key.
	# Each value is taken from the named environment variable when it is set, so
	# real credentials do not have to be saved in this script; otherwise the
	# placeholder below is used and must be replaced by hand.
	endpoint = os.environ.get("AZURE_FORM_RECOGNIZER_ENDPOINT", "https://<aifoundry-resource>.cognitiveservices.azure.com/")
	key = os.environ.get("AZURE_FORM_RECOGNIZER_KEY", "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxux")

	# Poppler "bin" directory handed to pdf2image when rasterizing PDFs.
	# Windows builds: https://github.com/oschwartz10612/poppler-windows/releases/
	# Set the POPPLER_PATH environment variable to override the default below.
	POPPLER_PATH = os.environ.get("POPPLER_PATH", r"C:\poppler-24.08.0\Library\bin")


	def dist(p1, p2):
	    """Return the Euclidean distance between two points.

	    Both arguments only need numeric ``x`` and ``y`` attributes.
	    ``math.hypot`` computes the square root of the summed squared
	    differences and does not overflow to infinity for very large
	    coordinate differences.
	    """
	    return math.hypot(p1.x - p2.x, p1.y - p2.y)

	def _load_image_pages(input_file):
	    """Return the pages of input_file as PIL images, indexable by page number.

	    PDFs are rasterized with pdf2image, using the Poppler binaries found at
	    POPPLER_PATH. Image files are wrapped in an ImageSequence.Iterator so a
	    multi-page TIFF can be addressed page by page. The process exits with an
	    error message when the file extension is not supported.
	    """
	    lowered = input_file.lower()
	    if lowered.endswith('.pdf'):
	        return convert_from_path(input_file, poppler_path=POPPLER_PATH)
	    if lowered.endswith(('.tif', '.tiff', '.jpg', '.jpeg', '.png', '.bmp')):
	        return ImageSequence.Iterator(Image.open(input_file))
	    sys.exit(f"Error: Unsupported input file extension {input_file}. Supported extensions: PDF, TIF, TIFF, JPG, JPEG, PNG, BMP.")


	def _add_invisible_words(pdf_canvas, words, scale, page_height, font_name):
	    """Draw the OCR words onto pdf_canvas as invisible text.

	    scale converts OCR page units into canvas units. page_height is the
	    canvas page height, used to flip the Y axis because the anchor is
	    computed as ``page_height - y``. Words without a four-point polygon, or
	    whose rendered width is zero, are skipped because they cannot be
	    positioned or scaled.
	    """
	    text = pdf_canvas.beginText()
	    # Text render mode 3 draws neither fill nor stroke, i.e. invisible text.
	    text.setTextRenderMode(3)
	    for word in words or ():
	        polygon = word.polygon
	        if not polygon or len(polygon) < 4:
	            continue
	        # NOTE(review): corners are presumably clockwise from the word's
	        # top-left (p0) to its bottom-left (p3) - confirm against the Azure docs.
	        p0, p1, p2, p3 = polygon[:4]

	        # The font size follows the box height; the width is matched
	        # afterwards through horizontal scaling.
	        desired_text_width = max(dist(p0, p1), dist(p3, p2)) * scale
	        desired_text_height = max(dist(p1, p2), dist(p0, p3)) * scale
	        font_size = desired_text_height
	        actual_text_width = pdf_canvas.stringWidth(word.content, font_name, font_size)
	        if actual_text_width <= 0:
	            # Empty content or a zero-height box: the horizontal scale below
	            # would divide by zero.
	            continue

	        # Rotation angle: average direction of the p0->p1 and p3->p2 edges.
	        text_angle = math.atan2((p1.y - p0.y + p2.y - p3.y) / 2.0,
	                                (p1.x - p0.x + p2.x - p3.x) / 2.0)
	        cos_a = math.cos(text_angle)
	        sin_a = math.sin(text_angle)
	        text.setFont(font_name, font_size)
	        text.setTextTransform(cos_a, -sin_a, sin_a, cos_a, p3.x * scale, page_height - p3.y * scale)
	        text.setHorizScale(desired_text_width / actual_text_width * 100)
	        text.textOut(word.content + " ")

	    pdf_canvas.drawText(text)


	def main():
	    """Create a searchable PDF from the scanned PDF or image named on the command line.

	    Steps: parse arguments, load the input as page images, run the Azure
	    "prebuilt-read" model on the original file, then build one PDF page per
	    OCR page with the page image plus an invisible text layer, and write the
	    result to the output file.
	    """
	    parser = argparse.ArgumentParser()
	    parser.add_argument('input_file', type=str, help="Input PDF or image (jpg, jpeg, tif, tiff, bmp, png) file name")
	    parser.add_argument('-o', '--output', type=str, required=False, default="", help="Output PDF file name. Default: input_file + .ocr.pdf")
	    args = parser.parse_args()

	    input_file = args.input_file
	    output_file = args.output or input_file + ".ocr.pdf"

	    # Loading input file
	    print(f"Loading input file {input_file}")
	    image_pages = _load_image_pages(input_file)

	    # Running OCR using Azure Form Recognizer Read API
	    print("Starting Azure Form Recognizer OCR process...")
	    document_analysis_client = DocumentAnalysisClient(endpoint=endpoint, credential=AzureKeyCredential(key), headers={"x-ms-useragent": "searchable-pdf-blog/1.0.0"})

	    with open(input_file, "rb") as f:
	        poller = document_analysis_client.begin_analyze_document("prebuilt-read", document=f)

	    ocr_results = poller.result()
	    print(f"Azure Form Recognizer finished OCR text for {len(ocr_results.pages)} pages.")

	    # Generate OCR overlay layer
	    print("Generating searchable PDF...")
	    output = PdfWriter()
	    default_font = "Times-Roman"
	    for page_id, page in enumerate(ocr_results.pages):
	        ocr_overlay = io.BytesIO()
	        # Look the page image up once per page instead of on every access.
	        img = image_pages[page_id]

	        # Overlay page size: the longer image side is mapped to the letter
	        # page height, and the other side keeps the aspect ratio.
	        page_scale = float(max(img.height, img.width)) / pagesizes.letter[1]
	        page_width = float(img.width) / page_scale
	        page_height = float(img.height) / page_scale

	        # Average of the horizontal and vertical OCR-unit -> canvas-unit ratios.
	        scale = (page_width / page.width + page_height / page.height) / 2.0
	        pdf_canvas = canvas.Canvas(ocr_overlay, pagesize=(page_width, page_height))

	        # Re-encode the page image as JPEG in memory before embedding it.
	        if img.mode != 'RGB':
	            img = img.convert('RGB')
	        img_bytes = io.BytesIO()
	        img.save(img_bytes, format='JPEG', quality=70, optimize=True)
	        img_bytes.seek(0)
	        compressed_img = Image.open(img_bytes)
	        pdf_canvas.drawInlineImage(compressed_img, 0, 0, width=page_width, height=page_height, preserveAspectRatio=True)

	        _add_invisible_words(pdf_canvas, page.words, scale, page_height, default_font)
	        pdf_canvas.save()

	        # Rewind the buffer so it can be read back as a one-page PDF.
	        ocr_overlay.seek(0)
	        output.add_page(PdfReader(ocr_overlay).pages[0])

	    # Save output searchable PDF file
	    with open(output_file, "wb") as output_stream:
	        output.write(output_stream)

	    print(f"Searchable PDF is created: {output_file}")


	if __name__ == '__main__':
	    main()
No results found