Last active
January 23, 2021 04:02
-
-
Save Xhynk/d2e9da6a3a81025cc08f45dd6e7b7253 to your computer and use it in GitHub Desktop.
Get a substring from an HTML string that includes HTML tags. Tags that are cut off mid-tag are removed; all others are auto-closed if necessary.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
/**
 * Truncate an HTML string to a number of visible text characters and return
 * well-formed markup.
 *
 * The input is scanned one character at a time. Characters inside a tag
 * (between a tag-opening '<' and its '>') are copied without being counted;
 * characters in text are counted. In text, a '>' or a '<' that is not followed
 * by a letter, '/' or '!' is treated as literal text and emitted escaped via
 * htmlentities(). When the count reaches $end, scanning stops and an ellipsis
 * ('…') is appended. The collected fragment is then parsed with DOMDocument
 * and the <body> node is serialised again, so any tags left open by the cut
 * are closed by the parser.
 *
 * NOTE(review): counting only begins after the first '>' has been seen, so text
 * that precedes the first tag (or input with no tags at all) is copied without
 * counting towards $end. That matches the previous behaviour and is kept
 * as-is; confirm whether it is intended.
 *
 * @param string $str          HTML to truncate.
 * @param int    $start        Accepted for signature compatibility; the body does
 *                             not read it. (It used to be declared with a default
 *                             of 0, but a default before the required $end can
 *                             never be used and is deprecated since PHP 8.0.)
 * @param int    $end          Maximum number of counted text characters. Values
 *                             <= 0 yield an empty string.
 * @param array  $replace_tags Map of tag name => replacement tag name, applied to
 *                             the serialised HTML. Only attribute-less tags
 *                             (`<tag>` / `</tag>`) match. Keys are interpolated
 *                             into the regular expression unescaped.
 * @param string $encoding     Encoding declared to the HTML parser. When it is
 *                             UTF-8 the input is scanned per code point;
 *                             otherwise (or if the input is not valid UTF-8) it
 *                             is scanned per byte.
 * @param bool   $remove_body  Strip the `<body>` / `</body>` wrapper from the
 *                             result.
 *
 * @return string The truncated, re-serialised HTML, or '' when there is nothing
 *                to output.
 */
function substr_html( string $str, int $start, int $end, array $replace_tags = array(), string $encoding = 'utf-8', bool $remove_body = true ): string {
    // Nothing can be emitted: bail out before DOMDocument, which would produce
    // a document without a <body> node for an empty fragment.
    if ( $str === '' || $end <= 0 ) {
        return '';
    }

    // Scan per code point for UTF-8 so a multi-byte character counts once and
    // is never split by the cut. preg_split() returns false on malformed UTF-8,
    // in which case we fall back to per-byte scanning.
    $is_utf8 = in_array( strtolower( $encoding ), array( 'utf-8', 'utf8' ), true );
    $chars   = $is_utf8 ? preg_split( '//u', $str, -1, PREG_SPLIT_NO_EMPTY ) : false;
    if ( $chars === false ) {
        $chars = str_split( $str );
    }
    $max = count( $chars );

    $out        = '';
    $last_caret = '';   // Last structural caret seen: '<' or '>'.
    $characters = 0;    // Counted text characters so far.
    $gt         = 0;    // Index of the last structural '>'.
    $lt         = -1;   // Index of the last structural '<'.

    for ( $i = 0; $characters < $end && $i < $max; $i++ ) {
        $char   = $chars[ $i ];
        $entity = false;

        // "In text" means the most recent caret closed a tag.
        if ( $gt > $lt && $last_caret === '>' ) {
            // Look ahead safely: there is no next character at the end of input.
            $next = $chars[ $i + 1 ] ?? '';
            if ( ( $char === '<' && ! preg_match( '/[a-zA-Z\/!]/', $next ) ) || $char === '>' ) {
                // A caret that does not start or end a tag is literal text.
                $entity = true;
            }
            // Every character met while in text is counted, including the '<'
            // that opens a tag; the matching '>' below takes that count back.
            $characters++;
        }

        if ( ! $entity ) {
            if ( $char === '<' ) {
                $last_caret = '<';
                $lt         = $i;
            } elseif ( $char === '>' ) {
                $last_caret = '>';
                $gt         = $i;
                $characters--;
            }
        }

        if ( $characters < $end ) {
            $out .= $entity ? htmlentities( $char ) : $char;
        }
        if ( $characters === $end ) {
            // Limit reached on this character: mark the cut. The loop condition
            // ends the scan right after this.
            $out .= '…';
        }
    }

    // The fragment is usually cut mid-document, so parser complaints are
    // expected; collect them internally instead of raising PHP warnings.
    $previous = libxml_use_internal_errors( true );
    $dom      = new \DOMDocument();
    // The XML declaration prefix tells the HTML parser which encoding to use.
    $loaded   = $dom->loadHTML( "<?xml encoding=\"{$encoding}\" ?>" . $out, LIBXML_HTML_NODEFDTD );
    if ( ! $previous ) {
        // Only discard the error buffer if the caller was not collecting errors.
        libxml_clear_errors();
    }
    libxml_use_internal_errors( $previous );

    if ( $loaded === false ) {
        return '';
    }

    $path   = new \DOMXPath( $dom );
    $bodies = $path->query( '/html/body' );
    $body   = ( $bodies instanceof \DOMNodeList ) ? $bodies->item( 0 ) : null;
    if ( $body === null ) {
        // E.g. whitespace-only fragment: no <body> was created.
        return '';
    }

    $html = $dom->saveHTML( $body );
    if ( $html === false ) {
        return '';
    }

    foreach ( $replace_tags as $replace => $with ) {
        $replaced = preg_replace( "/<(\/{0,1})$replace>/i", "<$1$with>", $html );
        // preg_replace() returns null on a pattern error; keep the HTML as it was.
        if ( is_string( $replaced ) ) {
            $html = $replaced;
        }
    }

    if ( $remove_body ) {
        $html = str_ireplace( array( '<body>', '</body>' ), '', $html );
    }

    return $html;
}
?> |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Added an exclamation mark to the caret check so that HTML comments are skipped as well, since they're effectively ignorable and irrelevant to the character limit.