Skip to content

Instantly share code, notes, and snippets.

@davutkmbr
Last active September 6, 2022 19:04
Show Gist options
  • Save davutkmbr/b55b1221ca3311cff48dd5b58491bb0c to your computer and use it in GitHub Desktop.
Save davutkmbr/b55b1221ca3311cff48dd5b58491bb0c to your computer and use it in GitHub Desktop.
A parser for email replies. It strips all quotes and signatures and returns only the reply.

Requirements

composer require league/html-to-markdown

How to use

Include this line in the outgoing mail; the parser removes it and everything that follows it:

<!-- Target: <p class="data-remove-below">...</p> -->
<p class="data-remove-below">##- Please write your answer above this line. -##</p>
<?php
namespace App\Services;
use DOMDocument;
use DOMElement;
use DOMXPath;
use Illuminate\Mail\Markdown;
use Illuminate\Support\HtmlString;
use League\HTMLToMarkdown\HtmlConverter;
/**
 * Extracts the newly written part of an HTML e-mail reply.
 *
 * The HTML is loaded into a DOM, known quote containers are removed (the
 * "write above this line" marker, <blockquote>, Gmail and Outlook wrappers,
 * divs holding a localized "X wrote:" header) and the remaining <body>
 * content is converted to Markdown and rendered.
 */
class EmailContentService
{
    /**
     * Patterns that recognise the header line mail clients put in front of
     * quoted text. Capture group 1 of every pattern is the header itself.
     *
     * @var string[]
     */
    private $quoteHeadersRegex = [
        '/^(.+\s<.+>,\s.*\syazdı:)$/m', // NAME <EMAIL>, … yazdı:
        '/^(.+\s<.+>\s.*\syazdı\s[(].+[)]:)$/m', // NAME <EMAIL> … yazdı (…):
        '/^(.+,\s([0-1]?[0-9]|2[0-3]):[0-5][0-9],\s".+"\s<.+>:)$/m', // …, HH:MM, "NAME" <EMAIL>:
        '/^\s*(On(?:(?!^>*\s*On\b|\bwrote:).){0,1000}wrote:)$/ms', // On DATE, NAME <EMAIL> wrote:
        '/^\s*(Le(?:(?!^>*\s*Le\b|\bécrit:).){0,1000}écrit(\s|\xc2\xa0):)$/ms', // Le DATE, NAME <EMAIL> a écrit :
        '/^\s*(El(?:(?!^>*\s*El\b|\bescribió:).){0,1000}escribió:)$/ms', // El DATE, NAME <EMAIL> escribió:
        '/^\s*(Il(?:(?!^>*\s*Il\b|\bscritto:).){0,1000}scritto:)$/ms', // Il DATE, NAME <EMAIL> ha scritto:
        '/^[\S\s]+ (написа(л|ла|в)+)+:$/msu', // Everything before написал: not ending on wrote:
        '/^\s*(Op\s.+?(schreef|geschreven).+:)$/ms', // Op DATE schreef NAME <EMAIL>:, Op DATE heeft NAME <EMAIL> het volgende geschreven:
        '/^\s*((W\sdniu|Dnia)\s.+?(pisze|napisał(\(a\))?):)$/msu', // W dniu DATE, NAME <EMAIL> pisze|napisał:
        '/^\s*(Den\s.+\sskrev\s.+:)$/m', // Den DATE skrev NAME <EMAIL>:
        '/^\s*(Am\s.+\sum\s.+\sschrieb\s.+:)$/m', // Am DATE um TIME schrieb NAME:
        '/^(在.+写道:)$/ms', // > 在 DATE, TIME, NAME 写道:
        '/^(20[0-9]{2}\..+\s작성:)$/m', // DATE TIME NAME 작성:
        '/^(20[0-9]{2}\/.+のメッセージ:)$/m', // DATE TIME、NAME のメッセージ:
        '/^(.+\s<.+>\sschrieb:)$/m', // NAME <EMAIL> schrieb:
        '/^\s*(From\s?:.+\s?(\[|<).+(\]|>))/mu', // "From: NAME <EMAIL>" OR "From : NAME <EMAIL>" OR "From : NAME<EMAIL>"(With support whitespace before start and before <)
        '/^\s*(发件人\s?:.+\s?(\[|<).+(\]|>))/mu', // "发件人: NAME <EMAIL>" OR "发件人 : NAME <EMAIL>" OR "发件人 : NAME<EMAIL>"(With support whitespace before start and before <)
        '/^\s*(De\s?:.+\s?(\[|<).+(\]|>))/mu', // "De: NAME <EMAIL>" OR "De : NAME <EMAIL>" OR "De : NAME<EMAIL>" (With support whitespace before start and before <)
        '/^\s*(Van\s?:.+\s?(\[|<).+(\]|>))/mu', // "Van: NAME <EMAIL>" OR "Van : NAME <EMAIL>" OR "Van : NAME<EMAIL>" (With support whitespace before start and before <)
        '/^\s*(Da\s?:.+\s?(\[|<).+(\]|>))/mu', // "Da: NAME <EMAIL>" OR "Da : NAME <EMAIL>" OR "Da : NAME<EMAIL>" (With support whitespace before start and before <)
        '/^(20[0-9]{2}\-(?:0?[1-9]|1[012])\-(?:0?[0-9]|[1-2][0-9]|3[01]|[1-9])\s[0-2]?[0-9]:\d{2}\s.+?:)$/ms', // 20YY-MM-DD HH:II GMT+01:00 NAME <EMAIL>:
        '/^\s*([a-z]{3,4}\.\s.+\sskrev\s.+:)$/ms', // DATE skrev NAME <EMAIL>:
    ];

    /** @var string|null Raw HTML after removeUnallowedTags(). */
    protected $content;

    /** @var DOMDocument DOM built from $content. */
    protected $document;

    /** @var DOMXPath XPath helper bound to $document. */
    protected $xpath;

    /**
     * Loads the given HTML into a fresh DOM and prepares the XPath helper.
     *
     * @param string|null $content Raw HTML of the e-mail.
     */
    public function init($content)
    {
        $this->content = $this->removeUnallowedTags($content);
        $this->document = new DOMDocument();

        // Mail HTML is rarely valid; collect libxml's complaints internally
        // instead of letting them surface as PHP warnings.
        $previous = libxml_use_internal_errors(true);
        try {
            // The XML declaration makes libxml read the markup as UTF-8.
            $this->document->loadHTML('<?xml encoding="utf-8" ?>' . $this->content);
        } finally {
            libxml_clear_errors();
            libxml_use_internal_errors($previous);
        }

        $this->xpath = new DOMXPath($this->document);
    }

    /**
     * Runs every clean-up step on the e-mail and renders what is left.
     *
     * @param string|null $content Raw HTML of the e-mail.
     *
     * @return HtmlString Result of Markdown::parse() on the cleaned reply.
     */
    public function format($content): HtmlString
    {
        $this->init($content);

        $this->removeBelow();
        $this->removeQuotes();
        $this->formatForGmail();
        $this->formatForOutlook();
        $this->formatForYandex();

        $body = $this->convertToMarkdown($this->body());

        return Markdown::parse($body ?? '');
    }

    /**
     * Removes the first Gmail quote container.
     *
     * The class attribute is matched per token, so a container that carries
     * extra classes next to "gmail_quote" is recognised as well.
     */
    private function formatForGmail(): void
    {
        $quote = $this->xpath
            ->query('//div[contains(concat(" ", normalize-space(@class), " "), " gmail_quote ")]')
            ->item(0);

        if ($quote !== null) {
            $this->detach($quote);
        }
    }

    /**
     * Removes every div whose text contains one of the quote headers.
     *
     * The divs are visited innermost-first. Removing the div that actually
     * holds the header takes the header out of its ancestors' text, so a
     * wrapper that also contains the new reply is kept.
     */
    private function formatForYandex(): void
    {
        // query() lists the divs in document order; reversed, nested divs
        // come before the divs that contain them.
        $divs = array_reverse(iterator_to_array($this->xpath->query('//div'), false));

        foreach ($divs as $div) {
            if ($this->containsQuoteHeader($div->textContent)) {
                $this->detach($div);
            }
        }
    }

    /**
     * Tells whether any pattern of $quoteHeadersRegex matches the text.
     */
    private function containsQuoteHeader(string $text): bool
    {
        foreach ($this->quoteHeadersRegex as $regex) {
            if (preg_match($regex, $text, $matched) === 1 && isset($matched[1])) {
                return true;
            }
        }

        return false;
    }

    /**
     * Removes Outlook reply/forward blocks: every div whose id ends with
     * "divRplyFwdMsg", the <hr> found within the three nodes before it, and
     * all siblings that follow it.
     */
    private function formatForOutlook(): void
    {
        $quotes = $this->xpath->query("//div[substring(@id, string-length(@id) - 12) = 'divRplyFwdMsg']");

        foreach ($quotes as $quote) {
            $this->removePrevSibling($quote, 'hr', 3);
            // Also removes $quote itself.
            $this->removeAllNextSiblings($quote);
        }
    }

    /**
     * Removes the first <p> whose class ends with "data-remove-below"
     * together with all siblings that follow it.
     */
    private function removeBelow(): void
    {
        $marker = $this->xpath
            ->query("//p[substring(@class, string-length(@class) - 16) = 'data-remove-below']")
            ->item(0);

        $this->removeAllNextSiblings($marker);
    }

    /**
     * Removes every <blockquote> element from the document.
     */
    private function removeQuotes(): void
    {
        foreach ($this->xpath->query('//blockquote') as $quote) {
            $this->detach($quote);
        }
    }

    /**
     * Strips the Content-Type <meta> tag and all <style> blocks from the raw
     * HTML. Matching is case-insensitive because HTML tag names are.
     *
     * NOTE(review): the meta tag is presumably dropped so its charset cannot
     * compete with the UTF-8 hint added in init() - confirm.
     *
     * @param string|null $content Raw HTML.
     *
     * @return string|null Cleaned HTML, or null when preg_replace() fails.
     */
    private function removeUnallowedTags($content): ?string
    {
        $content = preg_replace('/<meta http-equiv="Content-Type".+?>/i', '', $content);
        $content = preg_replace('/<style.*?>.*?<\/style>/msi', '', $content);

        return $content;
    }

    /**
     * Converts HTML to Markdown (tags without a Markdown equivalent are
     * stripped) and normalises blank lines.
     *
     * @param string|null $content HTML fragment.
     */
    private function convertToMarkdown($content): ?string
    {
        $converter = new HtmlConverter(['strip_tags' => true]);
        $markdown = $converter->convert($content);

        return $this->removeExtraLineBreaks($markdown);
    }

    /**
     * Collapses leading line breaks and every run of two or more line breaks
     * (optionally separated by whitespace) into a single blank line.
     *
     * @param string|null $markdown
     *
     * @return string|null Normalised text, or null when preg_replace() fails.
     */
    private function removeExtraLineBreaks($markdown): ?string
    {
        // "\n\n" instead of the former reversed "\n\r": both read as a blank
        // line to a CommonMark parser, but only this one is a regular sequence.
        return preg_replace('/(^[\r\n]*|[\r\n]+)[\s\t]*[\r\n]+/', "\n\n", $markdown);
    }

    /**
     * Returns the inner HTML of <body>, or an empty string when the document
     * has no body.
     *
     * The children are serialised one by one; cutting the <body> tags off
     * with a regex used to swallow the first child's opening tag whenever
     * <body> had no attributes.
     */
    private function body(): ?string
    {
        $body = $this->document->getElementsByTagName('body')->item(0);
        if ($body === null) {
            return '';
        }

        $html = '';
        foreach ($body->childNodes as $child) {
            $html .= $this->document->saveHTML($child);
        }

        return $html;
    }

    /**
     * Removes the closest previous sibling of $element that has the given
     * node name (or simply the closest one when no name is given).
     *
     * @param DOMElement|null $element        Node to look back from; null is a no-op.
     * @param string|null     $targetNodeName Node name to search for, e.g. "hr".
     * @param int             $try            Maximum number of siblings to inspect.
     */
    private function removePrevSibling(?DOMElement $element, ?string $targetNodeName = null, int $try = 100): void
    {
        if ($element === null) {
            return;
        }

        $candidate = $element->previousSibling;

        for ($checked = 0; $checked < $try && $candidate !== null; $checked++) {
            if ($targetNodeName === null || $candidate->nodeName === $targetNodeName) {
                $this->detach($candidate);

                return;
            }

            $candidate = $candidate->previousSibling;
        }
    }

    /**
     * Removes $element itself and every sibling that follows it.
     *
     * @param DOMElement|null $element First node to remove; null is a no-op.
     */
    private function removeAllNextSiblings(?DOMElement $element): void
    {
        $node = $element;

        while ($node !== null) {
            // Read the successor first: a removed node loses its siblings.
            $next = $node->nextSibling;
            $this->detach($node);
            $node = $next;
        }
    }

    /**
     * Removes the node from its parent; does nothing for a node that is
     * already detached.
     */
    private function detach(\DOMNode $node): void
    {
        if ($node->parentNode !== null) {
            $node->parentNode->removeChild($node);
        }
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment