composer require league/html-to-markdown
Add the following line to outgoing emails; when a reply is parsed, this marker and everything after it is removed:
<!-- Target: <p class="data-remove-below">...</p> -->
<p class="data-remove-below">##- Please write your answer above this line. -##</p>
<?php
namespace App\Services;
use DOMDocument;
use DOMElement;
use DOMNode;
use DOMXPath;
use Illuminate\Mail\Markdown;
use Illuminate\Support\HtmlString;
use League\HTMLToMarkdown\HtmlConverter;
/**
 * Cleans up the HTML body of an incoming email reply and renders it as Markdown.
 *
 * The HTML is loaded into a DOMDocument, quoted/forwarded parts are removed
 * (reply marker, <blockquote>, Gmail/Outlook containers, divs containing a
 * localized "X wrote:" header), and what remains of <body> is converted to
 * Markdown and parsed with Laravel's Markdown helper.
 */
class EmailContentService
{
    /**
     * Patterns recognising the header line mail clients put above quoted text.
     * Capture group 1 being set is what formatForYandex() treats as a match.
     *
     * @var string[]
     */
    private $quoteHeadersRegex = [
        '/^(.+\s<.+>,\s.*\syazdı:)$/m',
        '/^(.+\s<.+>\s.*\syazdı\s[(].+[)]:)$/m',
        '/^(.+,\s([0-1]?[0-9]|2[0-3]):[0-5][0-9],\s".+"\s<.+>:)$/m',
        '/^\s*(On(?:(?!^>*\s*On\b|\bwrote:).){0,1000}wrote:)$/ms', // On DATE, NAME <EMAIL> wrote:
        '/^\s*(Le(?:(?!^>*\s*Le\b|\bécrit:).){0,1000}écrit(\s|\xc2\xa0):)$/ms', // Le DATE, NAME <EMAIL> a écrit :
        '/^\s*(El(?:(?!^>*\s*El\b|\bescribió:).){0,1000}escribió:)$/ms', // El DATE, NAME <EMAIL> escribió:
        '/^\s*(Il(?:(?!^>*\s*Il\b|\bscritto:).){0,1000}scritto:)$/ms', // Il DATE, NAME <EMAIL> ha scritto:
        '/^[\S\s]+ (написа(л|ла|в)+)+:$/msu', // Everything before написал: not ending on wrote:
        '/^\s*(Op\s.+?(schreef|geschreven).+:)$/ms', // Op DATE schreef NAME <EMAIL>:, Op DATE heeft NAME <EMAIL> het volgende geschreven:
        '/^\s*((W\sdniu|Dnia)\s.+?(pisze|napisał(\(a\))?):)$/msu', // W dniu DATE, NAME <EMAIL> pisze|napisał:
        '/^\s*(Den\s.+\sskrev\s.+:)$/m', // Den DATE skrev NAME <EMAIL>:
        '/^\s*(Am\s.+\sum\s.+\sschrieb\s.+:)$/m', // Am DATE um TIME schrieb NAME:
        '/^(在.+写道:)$/ms', // > 在 DATE, TIME, NAME 写道:
        '/^(20[0-9]{2}\..+\s작성:)$/m', // DATE TIME NAME 작성:
        '/^(20[0-9]{2}\/.+のメッセージ:)$/m', // DATE TIME、NAME のメッセージ:
        '/^(.+\s<.+>\sschrieb:)$/m', // NAME <EMAIL> schrieb:
        '/^\s*(From\s?:.+\s?(\[|<).+(\]|>))/mu', // "From: NAME <EMAIL>" OR "From : NAME <EMAIL>" OR "From : NAME<EMAIL>"(With support whitespace before start and before <)
        '/^\s*(发件人\s?:.+\s?(\[|<).+(\]|>))/mu', // "发件人: NAME <EMAIL>" OR "发件人 : NAME <EMAIL>" OR "发件人 : NAME<EMAIL>"(With support whitespace before start and before <)
        '/^\s*(De\s?:.+\s?(\[|<).+(\]|>))/mu', // "De: NAME <EMAIL>" OR "De : NAME <EMAIL>" OR "De : NAME<EMAIL>" (With support whitespace before start and before <)
        '/^\s*(Van\s?:.+\s?(\[|<).+(\]|>))/mu', // "Van: NAME <EMAIL>" OR "Van : NAME <EMAIL>" OR "Van : NAME<EMAIL>" (With support whitespace before start and before <)
        '/^\s*(Da\s?:.+\s?(\[|<).+(\]|>))/mu', // "Da: NAME <EMAIL>" OR "Da : NAME <EMAIL>" OR "Da : NAME<EMAIL>" (With support whitespace before start and before <)
        '/^(20[0-9]{2}\-(?:0?[1-9]|1[012])\-(?:0?[0-9]|[1-2][0-9]|3[01]|[1-9])\s[0-2]?[0-9]:\d{2}\s.+?:)$/ms', // 20YY-MM-DD HH:II GMT+01:00 NAME <EMAIL>:
        '/^\s*([a-z]{3,4}\.\s.+\sskrev\s.+:)$/ms', // DATE skrev NAME <EMAIL>:
    ];

    /** @var string|null HTML after removeUnallowedTags(), as handed to the DOM parser. */
    protected $content;

    /** @var DOMDocument|null Parsed document; set by init(). */
    protected $document;

    /** @var DOMXPath|null XPath helper bound to $document; set by init(). */
    protected $xpath;

    /**
     * Parses the given HTML and prepares the DOM/XPath state used by every other method.
     *
     * @param string|null $content Raw HTML of the email body.
     */
    public function init($content)
    {
        $this->content = $this->removeUnallowedTags($content);
        $this->document = new DOMDocument();

        // Email HTML is rarely valid for libxml; collect parser diagnostics
        // internally instead of letting them surface as PHP warnings (which
        // error handlers commonly turn into exceptions), then restore the
        // caller's libxml setting.
        $previous = libxml_use_internal_errors(true);
        try {
            // The XML declaration prefix makes libxml read the markup as UTF-8.
            $this->document->loadHTML('<?xml encoding="utf-8" ?>' . $this->content);
        } finally {
            libxml_clear_errors();
            libxml_use_internal_errors($previous);
        }

        $this->xpath = new DOMXPath($this->document);
    }

    /**
     * Strips quoted/forwarded content from an email body and renders the rest.
     *
     * @param string|null $content Raw HTML of the email body.
     *
     * @return HtmlString Result of Markdown::parse() on the cleaned body.
     */
    public function format($content): HtmlString
    {
        $this->init($content);

        $this->removeBelow();
        $this->removeQuotes();
        $this->formatForGmail();
        $this->formatForOutlook();
        $this->formatForYandex();

        $body = $this->convertToMarkdown($this->body());

        return Markdown::parse($body);
    }

    /**
     * Removes the first <div class="gmail_quote"> (exact class match), if any.
     */
    private function formatForGmail(): void
    {
        $quote = $this->xpath->query('//div[@class="gmail_quote"]')->item(0);

        if ($quote !== null) {
            $this->detach($quote);
        }
    }

    /**
     * Removes every <div> whose text matches one of the quote-header patterns.
     *
     * NOTE(review): textContent includes the text of all descendants, so a
     * wrapper div that contains a quote header anywhere inside is removed as
     * a whole — confirm this is intended.
     */
    private function formatForYandex(): void
    {
        foreach ($this->xpath->query('//div') as $div) {
            // Decide once per div: trying further patterns after the div has
            // been removed would dereference a null parentNode.
            if ($this->matchesQuoteHeader($div->textContent)) {
                $this->detach($div);
            }
        }
    }

    /**
     * Removes reply/forward containers whose id ends with "divRplyFwdMsg",
     * an <hr> found within the three preceding siblings, and everything
     * that follows the container.
     */
    private function formatForOutlook(): void
    {
        $quotes = $this->xpath->query("//div[substring(@id, string-length(@id) - 12) = 'divRplyFwdMsg']");

        foreach ($quotes as $quote) {
            $this->removePrevSibling($quote, 'hr', 3);
            // Removes $quote itself as well as all of its following siblings.
            $this->removeAllNextSiblings($quote);
            $this->detach($quote);
        }
    }

    /**
     * Removes the first <p> whose class ends with "data-remove-below"
     * together with all siblings that follow it.
     */
    private function removeBelow(): void
    {
        $belowText = $this->xpath
            ->query("//p[substring(@class, string-length(@class) - 16) = 'data-remove-below']")
            ->item(0);

        $this->removeAllNextSiblings($belowText);
    }

    /**
     * Removes every <blockquote> element from the document.
     */
    private function removeQuotes(): void
    {
        foreach ($this->xpath->query('//blockquote') as $quote) {
            $this->detach($quote);
        }
    }

    /**
     * Drops the Content-Type <meta> tag and all <style> blocks before parsing.
     *
     * @param string|null $content Raw HTML.
     *
     * @return string|null Cleaned HTML, or null if preg_replace() fails.
     */
    private function removeUnallowedTags($content): ?string
    {
        return preg_replace(
            [
                '/<meta http-equiv="Content-Type".+?>/',
                '/<style.*?>.*?<\/style>/ms',
            ],
            '',
            $content
        );
    }

    /**
     * Converts HTML to Markdown (remaining tags stripped) and collapses blank lines.
     *
     * @param string|null $content HTML fragment.
     *
     * @return string|null Markdown text, or null if the line-break cleanup regex fails.
     */
    private function convertToMarkdown($content): ?string
    {
        $converter = new HtmlConverter(['strip_tags' => true]);

        return $this->removeExtraLineBreaks($converter->convert((string) $content));
    }

    /**
     * Collapses runs of line breaks (optionally separated by whitespace) into one separator.
     *
     * NOTE(review): the replacement is LF followed by CR, not CRLF; it is
     * kept exactly as it was — confirm before normalising.
     *
     * @param string|null $markdown Markdown text.
     *
     * @return string|null Cleaned text, or null if preg_replace() fails.
     */
    private function removeExtraLineBreaks($markdown): ?string
    {
        return preg_replace('/(^[\r\n]*|[\r\n]+)[\s\t]*[\r\n]+/', "\n\r", $markdown);
    }

    /**
     * Serialises the contents of <body> (without the <body> tag itself).
     *
     * The children are serialised one by one rather than stripping the
     * wrapper with a regex: the former pattern '<body.+?>' required at least
     * one character after "<body", so for an attribute-less <body> it
     * consumed the first child's opening tag or did not match at all.
     *
     * @return string|null Inner HTML of <body>; empty string when there is no <body>.
     */
    private function body(): ?string
    {
        $body = $this->document->getElementsByTagName('body')->item(0);

        if ($body === null) {
            return '';
        }

        $html = '';
        foreach ($body->childNodes as $child) {
            $html .= $this->document->saveHTML($child);
        }

        return $html;
    }

    /**
     * Removes the nearest preceding sibling of $element, optionally restricted by node name.
     *
     * @param DOMElement|null $element        Node whose preceding siblings are inspected.
     * @param string|null     $targetNodeName Node name to look for; null removes the
     *                                        immediately preceding sibling.
     * @param int             $try            Maximum number of siblings to inspect.
     */
    private function removePrevSibling(?DOMElement $element, ?string $targetNodeName = null, int $try = 100): void
    {
        if ($element === null) {
            return;
        }

        $candidate = $element->previousSibling;

        // Count inspected siblings so the $try limit is actually enforced.
        for ($inspected = 0; $inspected < $try && $candidate !== null; $inspected++) {
            if ($targetNodeName === null || $candidate->nodeName === $targetNodeName) {
                $this->detach($candidate);

                return;
            }

            $candidate = $candidate->previousSibling;
        }
    }

    /**
     * Removes $element itself and every sibling that follows it.
     *
     * @param DOMElement|null $element First node to remove; null is a no-op.
     */
    private function removeAllNextSiblings(?DOMElement $element): void
    {
        $node = $element;

        while ($node !== null) {
            // Read the successor first: a detached node has no siblings.
            $next = $node->nextSibling;
            $this->detach($node);
            $node = $next;
        }
    }

    /**
     * Tells whether any quote-header pattern matches $text with capture group 1 set.
     */
    private function matchesQuoteHeader(string $text): bool
    {
        foreach ($this->quoteHeadersRegex as $regex) {
            if (preg_match($regex, $text, $matched) === 1 && isset($matched[1])) {
                return true;
            }
        }

        return false;
    }

    /**
     * Removes $node from its parent; does nothing if it is already detached.
     */
    private function detach(DOMNode $node): void
    {
        $parent = $node->parentNode;

        if ($parent !== null) {
            $parent->removeChild($node);
        }
    }
}