Last active
November 19, 2024 22:13
-
-
Save andykirk/b304a3c84594515677e6 to your computer and use it in GitHub Desktop.
PHP Truncate HTML Function
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * truncate_html()
 *
 * Truncates an HTML string to a given length of _visible_ (content) characters.
 * E.g.
 * "This is some <b>bold</b> text" has a visible/content length of 22 characters,
 * though the total string length is 29 characters.
 * This function allows you to limit the visible/content length whilst preserving any HTML formatting.
 *
 * Behaviour:
 * - If the tag-stripped text already fits within $length, $html is returned unchanged.
 * - The length of $ending counts towards $length.
 * - An HTML entity (e.g. "&amp;") counts as a single visible character.
 * - Text is cut back to the last space so that words are not split; a single trailing
 *   punctuation character left over by the cut is removed.
 * - Elements still open at the cut point are closed in reverse order of opening.
 *   $ending is placed before the closing tag of the innermost open non-inline element,
 *   or at the very end when only inline elements (or no elements) are open.
 *
 * NOTE: character counting uses the mb_* functions and Unicode-aware regular expressions,
 * so the input is assumed to be UTF-8 (the default mbstring internal encoding).
 *
 * @param string $html   HTML fragment to truncate.
 * @param int    $length Maximum number of visible characters, including $ending.
 * @param string $ending Text appended to mark the truncation.
 * @return string|false  The truncated HTML; false if $html is not a string and the
 *                       triggered E_USER_ERROR did not halt execution.
 * @access public
 */
function truncate_html($html, $length = 100, $ending = '...')
{
    if (!is_string($html)) {
        trigger_error('Function \'truncate_html\' expects argument 1 to be an string', E_USER_ERROR);
        return false;
    }

    if (mb_strlen(strip_tags($html)) <= $length) {
        return $html;
    }

    // Kept so the untouched input can be returned if the parsing loop finds that everything fits.
    $source = $html;

    // Named, decimal and hexadecimal HTML entities; each one renders as a single character.
    $entity_alternation = '&[0-9a-z]{2,8};|&#[0-9]{1,7};|&#x[0-9a-f]{1,6};';
    $entity_pattern = '/' . $entity_alternation . '/i';

    $total = mb_strlen($ending);
    $open_tags = array();
    $return = '';
    $finished = false;
    $final_segment = '';

    $self_closing_elements = array(
        'area',
        'base',
        'br',
        'col',
        'frame',
        'hr',
        'img',
        'input',
        'link',
        'meta',
        'param',
    );

    $inline_containers = array(
        'a',
        'b',
        'abbr',
        'cite',
        'em',
        'i',
        'kbd',
        'span',
        'strong',
        'sub',
        'sup',
    );

    while (!$finished) {
        if (preg_match('/^<(\w+)[^>]*>/', $html, $matches)) {
            // Opening tag: remember it (unless self-closing) and copy it to the output.
            if (!in_array($matches[1], $self_closing_elements, true)) {
                $open_tags[] = $matches[1];
            }
            $html = substr_replace($html, '', 0, strlen($matches[0]));
            $return .= $matches[0];
        } elseif (preg_match('/^<\/(\w+)>/', $html, $matches)) {
            // Closing tag: it closes the MOST RECENTLY opened element of that name,
            // so search from the end of the stack to keep the nesting order correct.
            $keys = array_keys($open_tags, $matches[1], true);
            if ($keys) {
                unset($open_tags[end($keys)]);
            }
            $html = substr_replace($html, '', 0, strlen($matches[0]));
            $return .= $matches[0];
        } elseif (preg_match('/^[^<]+/', $html, $matches)) {
            // Text up to the next tag.
            $segment = $matches[0];
            // Replace each entity by one character so it is counted once.
            $segment_length = mb_strlen(preg_replace($entity_pattern, ' ', $segment));

            if ($segment_length + $total > $length) {
                // This segment crosses the limit: keep only what still fits.
                // Clamp at zero; a negative length would make mb_substr() cut from the end instead.
                $remainder = max(0, $length - $total);
                $entities_length = 0;
                if (preg_match_all($entity_pattern, $segment, $entities, PREG_OFFSET_CAPTURE)) {
                    foreach ($entities[0] as $entity) {
                        // PREG_OFFSET_CAPTURE reports BYTE offsets; convert to a character
                        // offset before comparing with the character budget.
                        $char_offset = mb_strlen(substr($segment, 0, $entity[1]));
                        if ($char_offset + 1 - $entities_length <= $remainder) {
                            // The entity starts inside the budget: it uses one visible
                            // character but its full source text must be kept.
                            $remainder--;
                            $entities_length += mb_strlen($entity[0]);
                        } else {
                            break;
                        }
                    }
                }
                $finished = true;
                $final_segment = mb_substr($segment, 0, $remainder + $entities_length);
            } else {
                // Whole segment fits: copy it and account for its length.
                $return .= $segment;
                $total += $segment_length;
                $html = substr_replace($html, '', 0, strlen($segment));
            }
        } else {
            if ($html === '') {
                // Input exhausted without reaching the limit (entities made the
                // tag-stripped length look longer than it renders): nothing to truncate.
                return $source;
            }
            // Remaining input starts with something that is neither a tag nor text
            // (e.g. a comment or a stray "<"); stop here rather than loop forever.
            $finished = true;
        }
    }

    if (strpos($final_segment, ' ') === false && preg_match('/<(\w+)[^>]*>$/', $return, $trailing)) {
        // No complete word survived and the output ends in an opening tag:
        // drop that (now empty) element instead of emitting "<b></b>".
        $return = preg_replace('/<(\w+)[^>]*>$/', '', $return);
        $keys = array_keys($open_tags, $trailing[1], true);
        if ($keys) {
            unset($open_tags[end($keys)]);
        }
    } else {
        // Cut back to the last space so the output does not end mid-word.
        // With no space at all, the partial word is dropped entirely.
        $last_space = mb_strrpos($final_segment, ' ');
        if ($last_space !== false) {
            $return .= mb_substr($final_segment, 0, $last_space);
        }
    }

    $return = trim($return);

    // Remove one trailing punctuation character (e.g. the comma in "Hello,") so it does not
    // sit directly before $ending. Work on characters, not bytes, and never damage the end
    // of a tag or of an entity.
    $last_char = mb_substr($return, -1);
    $ends_in_markup = $last_char === '>'
        || preg_match('/(?:' . $entity_alternation . ')$/i', $return) === 1;
    if ($last_char !== '' && !$ends_in_markup && !preg_match('/^[\p{L}\p{N}]$/u', $last_char)) {
        $return = mb_substr($return, 0, -1);
    }

    // Close whatever is still open, innermost first.
    $ending_added = false;
    foreach (array_reverse($open_tags) as $tag) {
        if (!$ending_added && !in_array($tag, $inline_containers, true)) {
            $return .= $ending;
            $ending_added = true;
        }
        $return .= '</' . $tag . '>';
    }

    // Plain text, or only inline elements were open: the ending still has to be shown.
    if (!$ending_added) {
        $return .= $ending;
    }

    return $return;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
It has a problem with multibyte characters.
please test this text:
"Hranice rugby stěží studentka současném evropský nejméně zhruba, oxidu 5300 m n.m. explozi ony specialistkou drahého po krize. Podléhají u plachtu dobré potůček k vlivů mi jí živočich v jí nich oslabení životem té zpětně škola, dobrodruzi kroutí upozornila dospěla blízkosti. Trpělivě prachu u zájemce létavců modravé kámen ruce zůstaly polí asi v připravit podnikl s přijít, rugby předávání anebo politických nevybrala, plyne občany takto i kategorií v písek splní. Žil nahlíží pohybovaly. Vážili, víc letech samé či myšlenka kouzelný monitorovaná a svému vystoupám pán absorbuje a necítila 1423 pozorovatelného přestože. Barvu loni o nad, EU testy od hornina k brzy bez pád ve potřeli."
with: echo truncate_html($text, 400);
I see some "question mark" characters, which means that a multibyte character has been broken.
Tested on PHP >= 7.0