EduardoSP6 · January 8, 2025 16:14
diff --git a/pdf_to_text_tesseract_orc.md b/pdf_to_text_tesseract_orc.md
diff --git a/PdfToText.php b/PdfToText.php
 <?php

 namespace Infrastructure\Utils\OCR;

 use Exception;
 use Imagick;
 use ImagickException;
 use RuntimeException;
 use thiagoalessio\TesseractOCR\TesseractOCR;

 /**
  * Extracts text from a PDF by rasterising its pages with Imagick and running
  * Tesseract OCR on each page image.
  */
 class PdfToText
 {
    private string $pdfPath;
    private string $tempDir;
    private ?int $resolution;

    /**
     * @param string   $pdfPath    Path of the PDF file to process.
     * @param string   $tempDir    Directory used for the temporary page images; created when missing.
     * @param int|null $resolution Resolution handed to Imagick::setResolution() before the PDF is read;
     *                             null skips that call.
     *
     * @throws RuntimeException When the temporary directory does not exist and cannot be created.
     */
    public function __construct(string $pdfPath, string $tempDir = 'temp', ?int $resolution = 300)
    {
        $this->pdfPath = $pdfPath;
        $this->tempDir = $tempDir;
        $this->resolution = $resolution;

        // The trailing is_dir() tolerates another process creating the directory in the meantime.
        if (!is_dir($this->tempDir) && !mkdir($this->tempDir, 0777, true) && !is_dir($this->tempDir)) {
            throw new RuntimeException(sprintf('Unable to create temporary directory "%s".', $this->tempDir));
        }
    }

    /**
     * Run OCR over a range of PDF pages and return the recognised text.
     *
     * Each page is written to a temporary PNG inside the temp directory, passed to
     * Tesseract and removed again. The result contains one "=== Página N ===" section
     * per processed page, separated by blank lines.
     *
     * @param int|null $startPage First page to process (1-based). null or 0 means the first page;
     *                            values below 1 are clamped to 1.
     * @param int|null $endPage   Last page to process (1-based, inclusive). null or 0 means the last page;
     *                            values beyond the page count are clamped to it.
     *
     * @return string Concatenated text of the processed pages; empty when the range selects no page.
     *
     * @throws RuntimeException When image processing or OCR fails (original exception attached as previous).
     */
    public function convert(?int $startPage = null, ?int $endPage = null): string
    {
        $imagick = null;

        try {
            $imagick = new Imagick();

            if ($this->resolution !== null) {
                // Configured before readImage() so it is in effect when the PDF is read.
                $imagick->setResolution($this->resolution, $this->resolution);
            }

            $imagick->readImage($this->pdfPath);

            $totalPages = $imagick->getNumberImages();

            // `?:` is deliberate: 0 keeps its historical meaning of "not specified".
            $firstPage = max(1, $startPage ?: 1);
            $lastPage = min($totalPages, $endPage ?: $totalPages);

            $allText = [];

            for ($page = $firstPage; $page <= $lastPage; $page++) {
                // Imagick indexes pages from 0.
                $imagick->setIteratorIndex($page - 1);

                // convert to png
                $imagick->setImageFormat('png');

                // improve image quality
                $imagick->transformImageColorspace(Imagick::COLORSPACE_RGB);
                $imagick->normalizeImage();
                $imagick->enhanceImage();
                $imagick->sharpenImage(0, 1);

                $tempImagePath = sprintf('%s/page_%d.png', $this->tempDir, $page);

                try {
                    // save page to a temp file
                    $imagick->writeImage($tempImagePath);

                    /*
                     * Process the OCR
                     *
                     * PSM (Page Segmentation Mode):
                     *  0  = Orientation and script detection (OSD) only
                     *  1  = Automatic page segmentation with OSD
                     *  2  = Automatic page segmentation, but no OSD, or OCR
                     *  3  = Fully automatic page segmentation, but no OSD (Default)
                     *  4  = Assume a single column of text of variable sizes
                     *  5  = Assume a single uniform block of vertically aligned text
                     *  6  = Assume a single uniform block of text
                     *  7  = Treat the image as a single text line
                     *  8  = Treat the image as a single word
                     *  9  = Treat the image as a single word in a circle
                     *  10 = Treat the image as a single character
                     *  11 = Sparse text. Find as much text as possible in no particular order
                     *  12 = Sparse text with OSD
                     *  13 = Raw line. Treat the image as a single text line
                     *
                     * OEM (OCR Engine Mode):
                     * 0 = Legacy engine only
                     * 1 = Neural nets LSTM engine only
                     * 2 = Legacy + LSTM engines
                     * 3 = Default, based on what is available
                     */
                    $ocr = new TesseractOCR($tempImagePath);
                    $ocr->lang('por')
                        ->psm(6)
                        ->oem(1);

                    $pageText = $ocr->run();
                } finally {
                    // Remove the temporary image even when writing or OCR failed.
                    if (is_file($tempImagePath)) {
                        unlink($tempImagePath);
                    }
                }

                $allText[] = sprintf("=== Página %d ===\n%s", $page, $pageText);
            }

            return implode("\n\n", $allText);
        } catch (ImagickException $e) {
            throw new RuntimeException("Image processing error: " . $e->getMessage(), 0, $e);
        } catch (Exception $e) {
            throw new RuntimeException("Error: " . $e->getMessage(), 0, $e);
        } finally {
            // Release the image resources on success and on failure alike.
            $imagick?->clear();
        }
    }

    /**
     * Check if PDF file is protected.
     *
     * The check only pings the file with Imagick: every ImagickException raised while
     * doing so is reported as "protected", whatever its actual cause.
     *
     * @return bool false when Imagick can ping the file, true otherwise.
     */
    public function isProtected(): bool
    {
        $imagick = null;

        try {
            $imagick = new Imagick();
            $imagick->pingImage($this->pdfPath);

            return false;
        } catch (ImagickException) {
            return true;
        } finally {
            $imagick?->clear();
        }
    }

    /**
     * Get PDF info.
     *
     * @return array{numberOfPages: int, resolution: array, format: string, colorspace: int}
     *               Page count plus the resolution, format and colorspace Imagick reports
     *               for the document after reading it.
     *
     * @throws RuntimeException When Imagick cannot read the file (original exception attached as previous).
     */
    public function getPDFInfo(): array
    {
        $imagick = null;

        try {
            $imagick = new Imagick();
            $imagick->readImage($this->pdfPath);

            return [
                'numberOfPages' => $imagick->getNumberImages(),
                'resolution' => $imagick->getImageResolution(),
                'format' => $imagick->getImageFormat(),
                'colorspace' => $imagick->getImageColorspace(),
            ];
        } catch (ImagickException $e) {
            throw new RuntimeException("Fail to get PDF info: " . $e->getMessage(), 0, $e);
        } finally {
            $imagick?->clear();
        }
    }
 }
	<?php

	namespace Infrastructure\Utils\OCR;

	use Exception;
	use Imagick;
	use ImagickException;
	use RuntimeException;
	use thiagoalessio\TesseractOCR\TesseractOCR;

	/**
	 * Extracts text from a PDF by rasterising its pages with Imagick and running
	 * Tesseract OCR on each page image.
	 */
	class PdfToText
	{
	    private string $pdfPath;
	    private string $tempDir;
	    private ?int $resolution;

	    /**
	     * @param string   $pdfPath    Path of the PDF file to process.
	     * @param string   $tempDir    Directory used for the temporary page images; created when missing.
	     * @param int|null $resolution Resolution handed to Imagick::setResolution() before the PDF is read;
	     *                             null skips that call.
	     *
	     * @throws RuntimeException When the temporary directory does not exist and cannot be created.
	     */
	    public function __construct(string $pdfPath, string $tempDir = 'temp', ?int $resolution = 300)
	    {
	        $this->pdfPath = $pdfPath;
	        $this->tempDir = $tempDir;
	        $this->resolution = $resolution;

	        // The trailing is_dir() tolerates another process creating the directory in the meantime.
	        if (!is_dir($this->tempDir) && !mkdir($this->tempDir, 0777, true) && !is_dir($this->tempDir)) {
	            throw new RuntimeException(sprintf('Unable to create temporary directory "%s".', $this->tempDir));
	        }
	    }

	    /**
	     * Run OCR over a range of PDF pages and return the recognised text.
	     *
	     * Each page is written to a temporary PNG inside the temp directory, passed to
	     * Tesseract and removed again. The result contains one "=== Página N ===" section
	     * per processed page, separated by blank lines.
	     *
	     * @param int|null $startPage First page to process (1-based). null or 0 means the first page;
	     *                            values below 1 are clamped to 1.
	     * @param int|null $endPage   Last page to process (1-based, inclusive). null or 0 means the last page;
	     *                            values beyond the page count are clamped to it.
	     *
	     * @return string Concatenated text of the processed pages; empty when the range selects no page.
	     *
	     * @throws RuntimeException When image processing or OCR fails (original exception attached as previous).
	     */
	    public function convert(?int $startPage = null, ?int $endPage = null): string
	    {
	        $imagick = null;

	        try {
	            $imagick = new Imagick();

	            if ($this->resolution !== null) {
	                // Configured before readImage() so it is in effect when the PDF is read.
	                $imagick->setResolution($this->resolution, $this->resolution);
	            }

	            $imagick->readImage($this->pdfPath);

	            $totalPages = $imagick->getNumberImages();

	            // `?:` is deliberate: 0 keeps its historical meaning of "not specified".
	            $firstPage = max(1, $startPage ?: 1);
	            $lastPage = min($totalPages, $endPage ?: $totalPages);

	            $allText = [];

	            for ($page = $firstPage; $page <= $lastPage; $page++) {
	                // Imagick indexes pages from 0.
	                $imagick->setIteratorIndex($page - 1);

	                // convert to png
	                $imagick->setImageFormat('png');

	                // improve image quality
	                $imagick->transformImageColorspace(Imagick::COLORSPACE_RGB);
	                $imagick->normalizeImage();
	                $imagick->enhanceImage();
	                $imagick->sharpenImage(0, 1);

	                $tempImagePath = sprintf('%s/page_%d.png', $this->tempDir, $page);

	                try {
	                    // save page to a temp file
	                    $imagick->writeImage($tempImagePath);

	                    /*
	                     * Process the OCR
	                     *
	                     * PSM (Page Segmentation Mode):
	                     *  0  = Orientation and script detection (OSD) only
	                     *  1  = Automatic page segmentation with OSD
	                     *  2  = Automatic page segmentation, but no OSD, or OCR
	                     *  3  = Fully automatic page segmentation, but no OSD (Default)
	                     *  4  = Assume a single column of text of variable sizes
	                     *  5  = Assume a single uniform block of vertically aligned text
	                     *  6  = Assume a single uniform block of text
	                     *  7  = Treat the image as a single text line
	                     *  8  = Treat the image as a single word
	                     *  9  = Treat the image as a single word in a circle
	                     *  10 = Treat the image as a single character
	                     *  11 = Sparse text. Find as much text as possible in no particular order
	                     *  12 = Sparse text with OSD
	                     *  13 = Raw line. Treat the image as a single text line
	                     *
	                     * OEM (OCR Engine Mode):
	                     * 0 = Legacy engine only
	                     * 1 = Neural nets LSTM engine only
	                     * 2 = Legacy + LSTM engines
	                     * 3 = Default, based on what is available
	                     */
	                    $ocr = new TesseractOCR($tempImagePath);
	                    $ocr->lang('por')
	                        ->psm(6)
	                        ->oem(1);

	                    $pageText = $ocr->run();
	                } finally {
	                    // Remove the temporary image even when writing or OCR failed.
	                    if (is_file($tempImagePath)) {
	                        unlink($tempImagePath);
	                    }
	                }

	                $allText[] = sprintf("=== Página %d ===\n%s", $page, $pageText);
	            }

	            return implode("\n\n", $allText);
	        } catch (ImagickException $e) {
	            throw new RuntimeException("Image processing error: " . $e->getMessage(), 0, $e);
	        } catch (Exception $e) {
	            throw new RuntimeException("Error: " . $e->getMessage(), 0, $e);
	        } finally {
	            // Release the image resources on success and on failure alike.
	            $imagick?->clear();
	        }
	    }

	    /**
	     * Check if PDF file is protected.
	     *
	     * The check only pings the file with Imagick: every ImagickException raised while
	     * doing so is reported as "protected", whatever its actual cause.
	     *
	     * @return bool false when Imagick can ping the file, true otherwise.
	     */
	    public function isProtected(): bool
	    {
	        $imagick = null;

	        try {
	            $imagick = new Imagick();
	            $imagick->pingImage($this->pdfPath);

	            return false;
	        } catch (ImagickException) {
	            return true;
	        } finally {
	            $imagick?->clear();
	        }
	    }

	    /**
	     * Get PDF info.
	     *
	     * @return array{numberOfPages: int, resolution: array, format: string, colorspace: int}
	     *               Page count plus the resolution, format and colorspace Imagick reports
	     *               for the document after reading it.
	     *
	     * @throws RuntimeException When Imagick cannot read the file (original exception attached as previous).
	     */
	    public function getPDFInfo(): array
	    {
	        $imagick = null;

	        try {
	            $imagick = new Imagick();
	            $imagick->readImage($this->pdfPath);

	            return [
	                'numberOfPages' => $imagick->getNumberImages(),
	                'resolution' => $imagick->getImageResolution(),
	                'format' => $imagick->getImageFormat(),
	                'colorspace' => $imagick->getImageColorspace(),
	            ];
	        } catch (ImagickException $e) {
	            throw new RuntimeException("Fail to get PDF info: " . $e->getMessage(), 0, $e);
	        } finally {
	            $imagick?->clear();
	        }
	    }
	}