Last active
January 25, 2019 09:29
-
-
Save julienbourdeau/ad9e34a8b65613aaf27ed5f46600eafc to your computer and use it in GitHub Desktop.
Algolia record split - HtmlSplitter
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
declare(strict_types=1); | |
/**
 * This file is part of Scout Extended.
 *
 * (c) Algolia Team <contact@algolia.com>
 *
 * For the full copyright and license information, please view the LICENSE
 * file that was distributed with this source code.
 */
namespace Algolia; | |
use DOMDocument; | |
/**
 * Splits the HTML content of a post into smaller records.
 *
 * The top-level children of the document body are walked in order. A new
 * record is started at every "level 1" heading, and also whenever the text
 * accumulated in the current record grows past the content limit.
 */
class HtmlSplitter
{
    /** Tag name of the heading that starts a new record. */
    protected $level1 = 'h2';

    /** Tag name of the sub-heading collected into a record's "subtitle-2" list. */
    protected $level2 = 'h3';

    /** A record is closed once its accumulated content is longer than this many characters. */
    protected $contentLimit = 1000;

    /**
     * Splits the content of the given post into records.
     *
     * The post content is run through the "the_content" filter and sanitized
     * before being handed to splitHtml().
     *
     * @param \WP_Post $post Post whose `post_content` is split.
     *
     * @return array List of records, see splitHtml() for their shape.
     */
    public function split(\WP_Post $post)
    {
        return $this->splitHtml((string) $this->get_sanitized_content($post));
    }

    /**
     * Splits an HTML fragment into records.
     *
     * Each record is an array with three keys:
     *  - "subtitle":   text of the level-1 heading that opened the record ('' if none),
     *  - "subtitle-2": list of level-2 heading texts met inside the record,
     *  - "content":    text of every other top-level node, joined with blank lines.
     *
     * @param string $html UTF-8 encoded HTML fragment.
     *
     * @return array List of records in document order; empty when there is nothing to split.
     */
    public function splitHtml(string $html): array
    {
        if (trim($html) === '') {
            return [];
        }

        $body = $this->loadBody($html);
        if ($body === null) {
            return [];
        }

        $records = [];
        $current = $this->emptyRecord();

        foreach ($body->childNodes as $node) {
            if ($node instanceof \DOMElement) {
                $tag = $node->tagName;
                $value = $this->get_node_content($node);
            } elseif ($node instanceof \DOMText && trim($node->textContent) !== '') {
                // Bare text sitting directly in the body is treated as regular content.
                $tag = '';
                $value = $node->textContent;
            } else {
                // Comments and whitespace-only text nodes carry nothing to index.
                continue;
            }

            if ($tag === $this->level1) {
                // A level-1 heading closes the record in progress, unless nothing
                // has been collected yet (e.g. the heading is the very first node).
                if (!$this->isEmptyRecord($current)) {
                    $records[] = $current;
                }
                $current = $this->emptyRecord($value);
            } elseif ($tag === $this->level2) {
                $current['subtitle-2'][] = $value;
            } else {
                $current['content'][] = $value;
            }

            // Close the record as soon as it is large enough; the follow-up
            // record starts without a subtitle.
            if (!empty($current['content']) && $this->isContentLargeEnough($current['content'])) {
                $records[] = $current;
                $current = $this->emptyRecord();
            }
        }

        // Keep whatever was collected after the last heading or flush.
        if (!$this->isEmptyRecord($current)) {
            $records[] = $current;
        }

        foreach ($records as $index => $record) {
            $records[$index]['content'] = implode("\n\n", $record['content']);
        }

        return $records;
    }

    /**
     * Parses the fragment and returns the <body> element of the resulting document.
     *
     * @param string $html Non-empty UTF-8 HTML fragment.
     *
     * @return \DOMElement|null The body element, or null when the parser produced none.
     */
    private function loadBody(string $html)
    {
        $dom = new DOMDocument();

        // Collect parser diagnostics internally instead of raising PHP warnings
        // (libxml complains about HTML5 tags and sloppy markup).
        $previous = libxml_use_internal_errors(true);

        // Without an explicit encoding libxml decodes the input as ISO-8859-1,
        // which corrupts multi-byte UTF-8 characters.
        $dom->loadHTML('<?xml encoding="UTF-8">' . $html);

        libxml_clear_errors();
        libxml_use_internal_errors($previous);

        return $dom->getElementsByTagName('body')->item(0);
    }

    /**
     * Builds a record with no content yet.
     *
     * @param string $subtitle Text of the level-1 heading opening the record.
     *
     * @return array
     */
    private function emptyRecord(string $subtitle = ''): array
    {
        return [
            'subtitle' => $subtitle,
            'subtitle-2' => [],
            'content' => [],
        ];
    }

    /**
     * Tells whether a record holds neither a subtitle, sub-headings nor content.
     *
     * @param array $record Record as built by emptyRecord().
     *
     * @return bool
     */
    private function isEmptyRecord(array $record): bool
    {
        return $record['subtitle'] === ''
            && $record['subtitle-2'] === []
            && $record['content'] === [];
    }

    /**
     * Returns the filtered post content with scripts and newlines removed.
     *
     * @param \WP_Post $post
     *
     * @return string
     */
    private function get_sanitized_content(\WP_Post $post)
    {
        $the_content = (string) apply_filters('the_content', $post->post_content);

        // Remove <script> tags. preg_replace() returns null on failure, in which
        // case the unstripped content is kept rather than losing everything.
        $without_scripts = preg_replace('#<script(.*?)>(.*?)</script>#is', '', $the_content);
        if ($without_scripts !== null) {
            $the_content = $without_scripts;
        }

        // Remove \n characters
        return str_replace("\n", '', $the_content);
    }

    /**
     * Returns the text of a node; lists are rendered as one " - item" line per child element.
     *
     * @param \DOMElement $node
     *
     * @return string
     */
    private function get_node_content(\DOMElement $node)
    {
        if (in_array($node->tagName, ['ul', 'ol'], true)) {
            $items = [];
            foreach ($node->childNodes as $child) {
                // Only child elements are list items; stray text nodes between
                // them would otherwise show up as blank bullets.
                if ($child instanceof \DOMElement) {
                    $items[] = $child->nodeValue;
                }
            }

            return ' - ' . implode("\n - ", $items);
        }

        return $node->textContent;
    }

    /**
     * Tells whether the given content is longer than the content limit.
     *
     * @param array|string $content Text, or list of text pieces (joined with spaces before measuring).
     *
     * @return bool
     */
    private function isContentLargeEnough($content)
    {
        if (is_array($content)) {
            $content = implode(' ', $content);
        }

        return mb_strlen($content, 'UTF-8') > $this->contentLimit;
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment