|
<?php |
|
|
|
namespace Infrastructure\Utils\OCR; |
|
|
|
use Exception; |
|
use Imagick; |
|
use ImagickException; |
|
use RuntimeException; |
|
use thiagoalessio\TesseractOCR\TesseractOCR; |
|
|
|
/**
 * Extracts text from a PDF by rasterising its pages with Imagick and running
 * Tesseract OCR (Portuguese language data) on each rendered page.
 */
class PdfToText
{
    /**
     * @param string   $pdfPath    Path of the PDF file handed to Imagick.
     * @param string   $tempDir    Directory for the intermediate page images; created when missing.
     * @param int|null $resolution Density passed to Imagick::setResolution() for both axes
     *                             before the PDF is read; null leaves Imagick's default untouched.
     *
     * @throws RuntimeException When the temporary directory does not exist and cannot be created.
     */
    public function __construct(
        private string $pdfPath,
        private string $tempDir = 'temp',
        private ?int $resolution = 300,
    ) {
        // The trailing is_dir() re-check tolerates another process creating the
        // directory between our first check and the mkdir() call.
        if (!is_dir($this->tempDir) && !mkdir($this->tempDir, 0777, true) && !is_dir($this->tempDir)) {
            throw new RuntimeException(
                sprintf('Unable to create temporary directory "%s".', $this->tempDir)
            );
        }
    }

    /**
     * Run OCR over a range of pages and return the recognised text.
     *
     * Every page contributes a "=== Página N ===" header followed by its text;
     * pages are separated by a blank line.
     *
     * @param int|null $startPage First page, 1-based. Null or 0 means the first page.
     * @param int|null $endPage   Last page, 1-based and inclusive. Null or 0 means the last page.
     *
     * @return string Recognised text, or an empty string when the range is empty
     *                (start page greater than end page).
     *
     * @throws RuntimeException When the range lies outside the document, or when
     *                          image processing or OCR fails (the original
     *                          exception is attached as "previous").
     */
    public function convert(?int $startPage = null, ?int $endPage = null): string
    {
        $imagick = null;

        try {
            $imagick = new Imagick();

            // Skip the call entirely for a null resolution rather than passing
            // null to a numeric parameter.
            if ($this->resolution !== null) {
                $imagick->setResolution($this->resolution, $this->resolution);
            }

            $imagick->readImage($this->pdfPath);

            $totalPages = $imagick->getNumberImages();

            // "?:" is deliberate: both null and 0 select the default bound.
            $firstPage = $startPage ?: 1;
            $lastPage = $endPage ?: $totalPages;

            // An empty range yields no text.
            if ($firstPage > $lastPage) {
                return '';
            }

            // Reject out-of-range bounds before any page is processed, so no
            // OCR work is wasted on a request that is going to fail anyway.
            if ($firstPage < 1 || $lastPage > $totalPages) {
                throw new RuntimeException(sprintf(
                    'Page range %d-%d is outside the document (1-%d).',
                    $firstPage,
                    $lastPage,
                    $totalPages
                ));
            }

            $allText = [];

            for ($page = $firstPage; $page <= $lastPage; $page++) {
                // Imagick's iterator index is 0-based.
                $imagick->setIteratorIndex($page - 1);

                $allText[] = sprintf(
                    "=== Página %d ===\n%s",
                    $page,
                    $this->recognizeCurrentPage($imagick, $page)
                );
            }

            return implode("\n\n", $allText);
        } catch (ImagickException $e) {
            throw new RuntimeException("Image processing error: " . $e->getMessage(), 0, $e);
        } catch (Exception $e) {
            throw new RuntimeException("Error: " . $e->getMessage(), 0, $e);
        } finally {
            // Release the image data on every exit path, including failures.
            $imagick?->clear();
        }
    }

    /**
     * Check if PDF file is protected.
     *
     * The file is pinged with Imagick; any ImagickException raised by that
     * call is reported as "protected".
     * NOTE(review): other failures that surface as ImagickException (for
     * example an unreadable file) are therefore also reported as protected —
     * confirm this is acceptable for callers.
     *
     * @return bool False when the ping succeeds, true when it throws.
     */
    public function isProtected(): bool
    {
        $imagick = null;

        try {
            $imagick = new Imagick();
            $imagick->pingImage($this->pdfPath);

            return false;
        } catch (ImagickException) {
            return true;
        } finally {
            $imagick?->clear();
        }
    }

    /**
     * Get PDF info.
     *
     * @return array Keys: 'numberOfPages' (Imagick::getNumberImages()),
     *               'resolution' (Imagick::getImageResolution()),
     *               'format' (Imagick::getImageFormat()) and
     *               'colorspace' (Imagick::getImageColorspace()).
     *
     * @throws RuntimeException When Imagick fails to read or inspect the file.
     */
    public function getPDFInfo(): array
    {
        $imagick = null;

        try {
            $imagick = new Imagick();
            $imagick->readImage($this->pdfPath);

            return [
                'numberOfPages' => $imagick->getNumberImages(),
                'resolution' => $imagick->getImageResolution(),
                'format' => $imagick->getImageFormat(),
                'colorspace' => $imagick->getImageColorspace(),
            ];
        } catch (ImagickException $e) {
            throw new RuntimeException("Fail to get PDF info: " . $e->getMessage(), 0, $e);
        } finally {
            $imagick?->clear();
        }
    }

    /**
     * Enhance the page the Imagick iterator currently points at, write it to a
     * temporary PNG and return the text Tesseract recognises in it.
     *
     * The temporary image is removed on every exit path.
     *
     * @param Imagick $imagick    Image sequence already positioned on the page to process.
     * @param int     $pageNumber 1-based page number, used only to name the temporary file.
     *
     * @return string Text returned by Tesseract for this page.
     *
     * @throws ImagickException When an image operation or the write fails.
     */
    private function recognizeCurrentPage(Imagick $imagick, int $pageNumber): string
    {
        // convert to png
        $imagick->setImageFormat('png');

        // improve image quality
        $imagick->transformImageColorspace(Imagick::COLORSPACE_RGB);
        $imagick->normalizeImage();
        $imagick->enhanceImage();
        $imagick->sharpenImage(0, 1);

        // save page to a temp file
        $tempImagePath = sprintf('%s/page_%d.png', $this->tempDir, $pageNumber);

        try {
            $imagick->writeImage($tempImagePath);

            /*
             * Process the OCR
             *
             * PSM (Page Segmentation Mode):
             *  0 = Orientation and script detection (OSD) only
             *  1 = Automatic page segmentation with OSD
             *  2 = Automatic page segmentation, but no OSD, or OCR
             *  3 = Fully automatic page segmentation, but no OSD (Default)
             *  4 = Assume a single column of text of variable sizes
             *  5 = Assume a single uniform block of vertically aligned text
             *  6 = Assume a single uniform block of text
             *  7 = Treat the image as a single text line
             *  8 = Treat the image as a single word
             *  9 = Treat the image as a single word in a circle
             * 10 = Treat the image as a single character
             * 11 = Sparse text. Find as much text as possible in no particular order
             * 12 = Sparse text with OSD
             * 13 = Raw line. Treat the image as a single text line
             *
             * OEM (OCR Engine Mode):
             * 0 = Legacy engine only
             * 1 = Neural nets LSTM engine only
             * 2 = Legacy + LSTM engines
             * 3 = Default, based on what is available
             */
            $ocr = new TesseractOCR($tempImagePath);
            $ocr->lang('por')
                ->psm(6)
                ->oem(1);

            return $ocr->run();
        } finally {
            // The file may not exist if writeImage() itself failed.
            if (is_file($tempImagePath)) {
                unlink($tempImagePath);
            }
        }
    }
}