Created
June 19, 2022 08:56
-
-
Save philipnorton42/176e84ba3b8b7330c7bc5e250bab8ea6 to your computer and use it in GitHub Desktop.
A Drupal service class that can extract base64 encoded files from HTML content.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
namespace Drupal\mymodule; | |
use Drupal\Core\File\Exception\FileNotExistsException; | |
use Drupal\Core\File\FileSystemInterface; | |
use Drupal\file\FileRepositoryInterface; | |
use Drupal\Core\Logger\LoggerChannelFactoryInterface; | |
use Drupal\Core\Logger\LoggerChannelInterface; | |
/**
 * Extract base64 encoded files from HTML content.
 *
 * Inline "data:" images found in img tags are decoded, written out through
 * the file repository service and the img tag is re-pointed at the new file.
 *
 * @see https://www.hashbangcode.com/article/drupal-9-removing-base64-encoded-files-content
 */
class Base64FileManagerService {

  /**
   * The file repository service.
   *
   * @var \Drupal\file\FileRepositoryInterface
   */
  protected $fileRepository;

  /**
   * The file system service.
   *
   * @var \Drupal\Core\File\FileSystemInterface
   */
  protected $fileSystem;

  /**
   * The logger channel used by this service.
   *
   * @var \Drupal\Core\Logger\LoggerChannelInterface
   */
  protected $logger;

  /**
   * Constructs a Base64FileManagerService object.
   *
   * @param \Drupal\file\FileRepositoryInterface $file_repository
   *   The file repository service.
   * @param \Drupal\Core\File\FileSystemInterface $file_system
   *   The file system service.
   * @param \Drupal\Core\Logger\LoggerChannelFactoryInterface $logger_factory
   *   The logging channel factory.
   */
  public function __construct(FileRepositoryInterface $file_repository, FileSystemInterface $file_system, LoggerChannelFactoryInterface $logger_factory) {
    $this->fileRepository = $file_repository;
    $this->fileSystem = $file_system;
    $this->logger = $logger_factory->get('mymodule.base64_file_manager_service');
  }

  /**
   * Replaces base64 encoded images in an HTML fragment with saved files.
   *
   * Every img tag whose src holds base64 data is decoded. When the decoded
   * bytes are detected as a PNG, JPEG or GIF image they are written to
   * "$directory/$id-N.ext" (N being the position of the img tag in the
   * content, starting at 1) and the tag's src, data-entity-uuid and
   * data-entity-type attributes are updated. Anything else is logged and left
   * untouched.
   *
   * @param string $content
   *   The HTML fragment to process.
   * @param int|string $id
   *   An identifier used as the prefix of the generated file names.
   * @param string $directory
   *   The destination directory (without trailing slash) for the files.
   *
   * @return string
   *   The original content when it contains no img tags, otherwise the
   *   re-serialised HTML fragment.
   *
   * @throws \Drupal\Core\File\Exception\FileNotExistsException
   *   Thrown when the file repository does not return a file.
   */
  public function extractBase64FilesFromHtml($content, $id, $directory): string {
    // Load the HTML from the content and find all of the img elements. The XML
    // declaration is prepended so that the parser reads the markup as UTF-8.
    $dom = new \DOMDocument('1.0', 'UTF-8');
    $dom->loadHTML('<?xml version="1.0" encoding="UTF-8"?>' . $content, LIBXML_NOERROR);
    $dom->encoding = 'UTF-8';
    $dom->substituteEntities = TRUE;

    $images = $dom->getElementsByTagName('img');
    if (count($images) === 0) {
      // No images found, so there is nothing to rewrite.
      return $content;
    }

    // A single detector is enough for every image in the document.
    $finfo = new \finfo(FILEINFO_MIME_TYPE);

    // Internal count to allow multiple files to be named differently.
    $count = 0;
    foreach ($images as $image) {
      $count++;
      $src = $image->getAttribute('src');

      // Search for base64 encoded data within the img tag.
      if (!preg_match('/(data|image\/png):?([^;]*);base64,(.*)/', $src)) {
        // Skip if this isn't a base64 encoded file.
        continue;
      }

      // Extract data for the file. The pattern above guarantees that a comma
      // is present, so index 1 always exists.
      $dataRaw = explode(',', $src);
      $fileData = base64_decode($dataRaw[1]);

      // Detect the mime type from the decoded bytes rather than trusting the
      // type declared in the data URI.
      $mimeType = (string) $finfo->buffer($fileData);

      // We need to make sure the base64 data is actually an accepted image.
      if (!preg_match('/^image\/(png|jpg|jpeg|gif)$/', $mimeType, $mime_match)) {
        // Skip this file since it's not an image.
        $this->logger->info('File of type @mime not decoded when processing content', ['@mime' => $mimeType]);
        continue;
      }

      $fileName = $directory . '/' . $id . '-' . $count . '.' . $mime_match[1];

      /** @var \Drupal\file\FileInterface $file */
      $file = $this->fileRepository->writeData($fileData, $fileName, FileSystemInterface::EXISTS_REPLACE);
      if (!$file) {
        throw new FileNotExistsException('Could not create the file ' . $fileName);
      }

      // Update the img src and add needed attributes.
      $image->setAttribute('src', $file->createFileUrl());
      $image->setAttribute('data-entity-uuid', $file->uuid());
      $image->setAttribute('data-entity-type', 'file');
    }

    $body = $dom->getElementsByTagName('body')->item(0);
    if ($body === NULL) {
      // Without a body element there is nothing to serialise back out.
      return $content;
    }

    // The DOM document currently contains the doctype, head and body tags.
    // To remove these we create a mock document and copy the children
    // of the body tag into it.
    $mock = new \DOMDocument('1.0', 'UTF-8');
    $mock->encoding = 'UTF-8';
    $mock->substituteEntities = TRUE;
    foreach ($body->childNodes as $child) {
      $mock->appendChild($mock->importNode($child, TRUE));
    }

    // Convert mock HTML document back into HTML.
    return trim($mock->saveHTML());
  }

}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment