Skip to content

Instantly share code, notes, and snippets.

@philipnorton42
Created June 19, 2022 08:56
Show Gist options
  • Save philipnorton42/176e84ba3b8b7330c7bc5e250bab8ea6 to your computer and use it in GitHub Desktop.
Save philipnorton42/176e84ba3b8b7330c7bc5e250bab8ea6 to your computer and use it in GitHub Desktop.
A Drupal service class that can extract base64 encoded files from HTML content.
<?php
namespace Drupal\mymodule;
use Drupal\Core\File\Exception\FileNotExistsException;
use Drupal\Core\File\FileSystemInterface;
use Drupal\file\FileRepositoryInterface;
use Drupal\Core\Logger\LoggerChannelFactoryInterface;
use Drupal\Core\Logger\LoggerChannelInterface;
/**
 * Extracts base64 encoded image files from HTML content.
 *
 * Inline "data:" images found in img tags are written out as files through
 * the file repository and the markup is rewritten to point at those files.
 *
 * @see https://www.hashbangcode.com/article/drupal-9-removing-base64-encoded-files-content
 */
class Base64FileManagerService {

  /**
   * The file repository service.
   *
   * @var \Drupal\file\FileRepositoryInterface
   */
  protected $fileRepository;

  /**
   * The file system service.
   *
   * @var \Drupal\Core\File\FileSystemInterface
   */
  protected $fileSystem;

  /**
   * The logger channel used by this service.
   *
   * @var \Drupal\Core\Logger\LoggerChannelInterface
   */
  protected $logger;

  /**
   * Constructs a Base64FileManagerService object.
   *
   * @param \Drupal\file\FileRepositoryInterface $file_repository
   *   The file repository service.
   * @param \Drupal\Core\File\FileSystemInterface $file_system
   *   The file system service.
   * @param \Drupal\Core\Logger\LoggerChannelFactoryInterface $logger_factory
   *   The logging channel factory.
   */
  public function __construct(FileRepositoryInterface $file_repository, FileSystemInterface $file_system, LoggerChannelFactoryInterface $logger_factory) {
    $this->fileRepository = $file_repository;
    $this->fileSystem = $file_system;
    $this->logger = $logger_factory->get('mymodule.base64_file_manager_service');
  }

  /**
   * Replaces base64 encoded images in HTML with references to saved files.
   *
   * Every img element whose src holds base64 data that decodes to a PNG, JPEG
   * or GIF image is written to "$directory/$id-N.ext" (N being the position of
   * the img element in the document) and its src is replaced with the URL of
   * the saved file. Images that cannot be decoded, or that decode to any other
   * type, are logged and left untouched.
   *
   * @param string $content
   *   The HTML fragment to process.
   * @param int|string $id
   *   An identifier used as the prefix of the generated file names.
   * @param string $directory
   *   The destination directory (URI) the files are written into.
   *
   * @return string
   *   The original content when it contains no img elements, otherwise the
   *   re-serialised HTML fragment.
   *
   * @throws \Drupal\Core\File\Exception\FileNotExistsException
   *   Thrown when the file repository does not return a file.
   */
  public function extractBase64FilesFromHtml($content, $id, $directory):string {
    // The XML declaration makes libxml treat the fragment as UTF-8.
    $dom = new \DOMDocument('1.0', 'UTF-8');
    $dom->loadHTML('<?xml version="1.0" encoding="UTF-8"?>' . $content, LIBXML_NOERROR);
    $dom->encoding = 'UTF-8';
    $dom->substituteEntities = TRUE;

    $images = $dom->getElementsByTagName('img');
    if ($images->length === 0) {
      // Nothing to extract, hand the content back untouched.
      return $content;
    }

    // One MIME detector is enough for every image in the document.
    $finfo = new \finfo(FILEINFO_MIME_TYPE);
    // The destination is only prepared once a file actually has to be saved.
    $directoryPrepared = FALSE;
    // Position counter so that multiple files get different names.
    $count = 0;

    foreach ($images as $image) {
      $count++;
      $src = $image->getAttribute('src');

      // Skip anything that isn't a base64 encoded data source.
      if (!preg_match('/(data|image\/png):?([^;]*);base64,(.*)/', $src)) {
        continue;
      }

      // Everything after the first comma is the encoded payload.
      $encoded = explode(',', $src, 2)[1];
      $fileData = base64_decode($encoded);
      if ($fileData === FALSE) {
        $this->logger->info('Base64 data could not be decoded when processing content');
        continue;
      }

      // Trust the decoded bytes rather than the type declared in the src
      // attribute when deciding whether (and as what) to save the file.
      $mimeType = (string) $finfo->buffer($fileData);
      if (!preg_match('/image\/(png|jpg|jpeg|gif)/', $mimeType, $mime_match)) {
        $this->logger->info('File of type @mime not decoded when processing content', ['@mime' => $mimeType]);
        continue;
      }

      if (!$directoryPrepared) {
        // Create the destination if needed; when this fails the write below
        // reports the problem.
        $this->fileSystem->prepareDirectory($directory, FileSystemInterface::CREATE_DIRECTORY | FileSystemInterface::MODIFY_PERMISSIONS);
        $directoryPrepared = TRUE;
      }

      $fileName = $directory . '/' . $id . '-' . $count . '.' . $mime_match[1];
      /** @var \Drupal\file\FileInterface $file */
      $file = $this->fileRepository->writeData($fileData, $fileName, FileSystemInterface::EXISTS_REPLACE);
      if (!$file) {
        throw new FileNotExistsException('Could not create the file ' . $fileName);
      }

      // Update the img src and add needed attributes.
      $image->setAttribute('src', $file->createFileUrl());
      $image->setAttribute('data-entity-uuid', $file->uuid());
      $image->setAttribute('data-entity-type', 'file');
    }

    $body = $dom->getElementsByTagName('body')->item(0);
    if ($body === NULL) {
      // Without a body element there is nothing to serialise.
      return $content;
    }

    // The parsed document contains the doctype, html and body wrappers. To
    // drop them, the children of the body are copied into an empty document
    // which is then serialised on its own.
    $mock = new \DOMDocument('1.0', 'UTF-8');
    $mock->encoding = 'UTF-8';
    $mock->substituteEntities = TRUE;
    foreach ($body->childNodes as $child) {
      $mock->appendChild($mock->importNode($child, TRUE));
    }

    return trim($mock->saveHTML());
  }

}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment