Skip to content

Instantly share code, notes, and snippets.

@EduardoSP6
Last active January 8, 2025 16:14
Show Gist options
  • Save EduardoSP6/adafaa4821ee1293be13262c6ea38692 to your computer and use it in GitHub Desktop.
Save EduardoSP6/adafaa4821ee1293be13262c6ea38692 to your computer and use it in GitHub Desktop.
How to convert PDF to Text with Tesseract OCR

How to convert PDF file to text with Tesseract OCR and Imagick

Installation

First, install Tesseract (the OCR engine) and Imagick (used to convert each PDF page to an image):

# System packages: ImageMagick (plus its MagickWand development headers),
# Ghostscript, and the Portuguese language data for Tesseract. The apt caches
# and temp directories are removed in the same layer to keep the image small.
RUN apt-get update && apt-get install -y \
    libmagickwand-dev \
    imagemagick \
    ghostscript \
    tesseract-ocr-por \
    && apt-get autoremove --purge -y && apt-get clean \
    && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
    
# Build the imagick PHP extension from PECL and enable it.
RUN pecl install imagick \
    && docker-php-ext-enable imagick
    
# Change the ImageMagick policy entry for the PDF pattern from rights="none"
# to rights="read|write". NOTE(review): the path assumes ImageMagick 6's
# config directory — confirm it matches the installed version.
RUN sed -i 's/rights="none" pattern="PDF"/rights="read|write" pattern="PDF"/' /etc/ImageMagick-6/policy.xml    

Example using PdfToText class

  // Path of the PDF to read. NOTE(review): public_path() is not defined in this
  // file — presumably the framework helper that resolves a path inside the
  // public directory; confirm for your project.
  $pdf = public_path("cnh.pdf");
  // Uses the default temp directory ('temp') and resolution (300).
  $pdfToText = new PdfToText($pdf);

  // With no arguments, every page of the document is converted.
  $text = $pdfToText->convert();
  
  // dump() is likewise an external debug helper, not defined here.
  dump($text);
<?php
namespace Infrastructure\Utils\OCR;
use Exception;
use Imagick;
use ImagickException;
use RuntimeException;
use thiagoalessio\TesseractOCR\TesseractOCR;
/**
 * Extracts text from a PDF by rendering its pages to PNG images with Imagick
 * and running Tesseract OCR on each rendered page.
 */
class PdfToText
{
    /** Language passed to Tesseract. */
    private const OCR_LANGUAGE = 'por';

    /** Page segmentation mode: 6 = assume a single uniform block of text. */
    private const OCR_PAGE_SEGMENTATION_MODE = 6;

    /** OCR engine mode: 1 = neural nets LSTM engine only. */
    private const OCR_ENGINE_MODE = 1;

    private string $pdfPath;
    private string $tempDir;
    private ?int $resolution;

    /**
     * @param string   $pdfPath    Path of the PDF handed to Imagick.
     * @param string   $tempDir    Directory for the intermediate page images; created if missing.
     * @param int|null $resolution Resolution set on Imagick before reading the PDF,
     *                             or null to keep Imagick's default.
     *
     * @throws RuntimeException If the temporary directory cannot be created.
     */
    public function __construct(string $pdfPath, string $tempDir = 'temp', ?int $resolution = 300)
    {
        $this->pdfPath = $pdfPath;
        $this->tempDir = $tempDir;
        $this->resolution = $resolution;

        // The trailing is_dir() covers the race where another process creates
        // the directory between our check and our mkdir() call.
        if (!is_dir($this->tempDir) && !mkdir($this->tempDir, 0777, true) && !is_dir($this->tempDir)) {
            throw new RuntimeException(sprintf('Unable to create temporary directory "%s".', $this->tempDir));
        }
    }

    /**
     * Convert PDF content to text via OCR.
     *
     * Each page in the requested range is rendered to a temporary PNG, passed
     * to Tesseract, and the results are joined with a "=== Página N ===" header
     * in front of each page.
     *
     * @param int|null $startPage First page to convert (1-based). Null or 0 means the first page.
     * @param int|null $endPage   Last page to convert (1-based, inclusive). Null or 0 means the last page.
     *
     * @return string The recognised text of all converted pages.
     *
     * @throws RuntimeException If the page range is invalid or if image processing or OCR fails.
     */
    public function convert(?int $startPage = null, ?int $endPage = null): string
    {
        // `?:` is deliberate: both null and 0 select the default, as before.
        $firstPage = $startPage ?: 1;

        if ($firstPage < 1) {
            throw new RuntimeException(
                sprintf('Invalid page range: start page must be at least 1, got %d.', $firstPage)
            );
        }

        if ($endPage && $endPage < $firstPage) {
            throw new RuntimeException(
                sprintf('Invalid page range: end page %d is before start page %d.', $endPage, $firstPage)
            );
        }

        $imagick = null;

        try {
            $imagick = new Imagick();

            // Resolution must be set before reading so it applies to the rendering.
            if ($this->resolution !== null) {
                $imagick->setResolution($this->resolution, $this->resolution);
            }

            $imagick->readImage($this->pdfPath);

            $totalPages = $imagick->getNumberImages();
            $lastPage = $endPage ?: $totalPages;

            if ($lastPage > $totalPages) {
                throw new RuntimeException(
                    sprintf('Invalid page range: end page %d exceeds the %d page(s) available.', $lastPage, $totalPages)
                );
            }

            // Unique per call so concurrent conversions sharing one temp
            // directory never overwrite each other's page images.
            $runId = bin2hex(random_bytes(6));
            $allText = [];

            for ($index = $firstPage - 1; $index < $lastPage; $index++) {
                $pageNumber = $index + 1;
                $pageText = $this->recognisePage($imagick, $index, $runId);
                $allText[] = sprintf("=== Página %d ===\n%s", $pageNumber, $pageText);
            }

            return implode("\n\n", $allText);
        } catch (ImagickException $e) {
            throw new RuntimeException("Image processing error: " . $e->getMessage(), 0, $e);
        } catch (RuntimeException $e) {
            // Already the type callers expect (e.g. our own range error); do not re-wrap.
            throw $e;
        } catch (Exception $e) {
            throw new RuntimeException("Error: " . $e->getMessage(), 0, $e);
        } finally {
            // Release the Imagick resources on both success and failure.
            $imagick?->clear();
        }
    }

    /**
     * Render one page to a temporary PNG and run OCR on it.
     *
     * PSM (Page Segmentation Mode):
     *  0 = Orientation and script detection (OSD) only
     *  1 = Automatic page segmentation with OSD
     *  2 = Automatic page segmentation, but no OSD, or OCR
     *  3 = Fully automatic page segmentation, but no OSD (Default)
     *  4 = Assume a single column of text of variable sizes
     *  5 = Assume a single uniform block of vertically aligned text
     *  6 = Assume a single uniform block of text
     *  7 = Treat the image as a single text line
     *  8 = Treat the image as a single word
     *  9 = Treat the image as a single word in a circle
     * 10 = Treat the image as a single character
     * 11 = Sparse text. Find as much text as possible in no particular order
     * 12 = Sparse text with OSD
     * 13 = Raw line. Treat the image as a single text line
     *
     * OEM (OCR Engine Mode):
     * 0 = Legacy engine only
     * 1 = Neural nets LSTM engine only
     * 2 = Legacy + LSTM engines
     * 3 = Default, based on what is available
     *
     * @param Imagick $imagick Imagick instance holding the loaded PDF pages.
     * @param int     $index   Zero-based index of the page to process.
     * @param string  $runId   Unique token included in the temporary file name.
     *
     * @return string Text recognised on the page.
     *
     * @throws ImagickException If the page cannot be rendered or written.
     */
    private function recognisePage(Imagick $imagick, int $index, string $runId): string
    {
        // set the current page
        $imagick->setIteratorIndex($index);

        // convert to png
        $imagick->setImageFormat('png');

        // improve image quality
        $imagick->transformImageColorspace(Imagick::COLORSPACE_RGB);
        $imagick->normalizeImage();
        $imagick->enhanceImage();
        $imagick->sharpenImage(0, 1);

        $tempImagePath = sprintf('%s/page_%s_%d.png', $this->tempDir, $runId, $index + 1);

        try {
            // save page to a temp file
            $imagick->writeImage($tempImagePath);

            $ocr = new TesseractOCR($tempImagePath);
            $ocr->lang(self::OCR_LANGUAGE)
                ->psm(self::OCR_PAGE_SEGMENTATION_MODE)
                ->oem(self::OCR_ENGINE_MODE);

            return $ocr->run();
        } finally {
            // Remove the page image even when writing or OCR fails.
            if (is_file($tempImagePath)) {
                unlink($tempImagePath);
            }
        }
    }

    /**
     * Check if PDF file is protected.
     *
     * Returns true whenever Imagick raises an ImagickException while pinging
     * the file, so any such failure is reported as "protected", not only
     * password protection.
     *
     * @return bool False if Imagick could ping the file, true otherwise.
     */
    public function isProtected(): bool
    {
        $imagick = null;

        try {
            $imagick = new Imagick();
            $imagick->pingImage($this->pdfPath);

            return false;
        } catch (ImagickException) {
            return true;
        } finally {
            $imagick?->clear();
        }
    }

    /**
     * Get PDF info.
     *
     * @return array Keys: 'numberOfPages', 'resolution', 'format' and 'colorspace',
     *               as reported by Imagick for the loaded document.
     *
     * @throws RuntimeException If Imagick cannot read the PDF.
     */
    public function getPDFInfo(): array
    {
        $imagick = null;

        try {
            $imagick = new Imagick();
            $imagick->readImage($this->pdfPath);

            return [
                'numberOfPages' => $imagick->getNumberImages(),
                'resolution' => $imagick->getImageResolution(),
                'format' => $imagick->getImageFormat(),
                'colorspace' => $imagick->getImageColorspace(),
            ];
        } catch (ImagickException $e) {
            throw new RuntimeException("Fail to get PDF info: " . $e->getMessage(), 0, $e);
        } finally {
            $imagick?->clear();
        }
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment