Last active
August 25, 2023 13:59
-
-
Save sergeliatko/0fc9925a4791d7afd8bd9b25545c4906 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php
/**
 * Updated: 2023-08-25 15:58 CEST - fixed bugs in block recognition, improved logic.
 */
/**
 * Tidies a text string: decodes HTML entities, unifies line endings, drops blank lines
 * and squeezes runs of spaces inside every remaining line.
 *
 * @param string $text The raw text to tidy up.
 *
 * @return string The tidied text; its lines are trimmed and joined with a single "\n".
 * @noinspection PhpUnused
 */
function clean_text_string( string $text = '' ): string {
	// Turn entities such as &amp; back into their characters.
	$decoded = html_entity_decode( $text, ENT_QUOTES | ENT_HTML5, 'UTF-8' );
	// Windows (\r\n) and old Mac (\r) line endings both become \n.
	$normalized = str_replace( [ "\r\n", "\r" ], "\n", $decoded );
	$kept       = [];
	foreach ( explode( "\n", $normalized ) as $raw_line ) {
		// Collapse two or more consecutive spaces, then strip the surrounding whitespace.
		$line = trim( preg_replace( '/ {2,}/', ' ', $raw_line ) );
		// Lines that end up empty are dropped.
		if ( '' !== $line ) {
			$kept[] = $line;
		}
	}

	return implode( "\n", $kept );
}
/**
 * Greedily packs text parts into chunks whose byte length stays as close as possible to, but not above, the limit.
 *
 * Parts are trimmed and taken in order. A part is appended to the chunk currently being built (separated by
 * the glue) while the joined length stays within the limit; otherwise the part starts a new chunk. A single
 * part that is longer than the limit is kept whole as its own chunk - this function never cuts a part.
 * Parts that are empty after trimming are skipped.
 *
 * @param array  $parts An array of text parts (strings) to be joined.
 * @param int    $limit The maximum string length (in bytes, as measured by strlen) accepted (default: 3200).
 * @param string $glue  The string to use as a glue when joining parts into a chunk (default: "\n\n").
 *
 * @return array A sequentially indexed array of non-empty chunks.
 * @noinspection PhpUnused
 */
function resize_text_parts_under_limit( array $parts = [], int $limit = 3200, string $glue = "\n\n" ): array {
	$results     = [];
	$glue_length = strlen( $glue );
	foreach ( $parts as $part ) {
		$part = trim( $part );
		// Nothing to add: skipping here also keeps the result free of empty chunks and stray glue.
		if ( '' === $part ) {
			continue;
		}
		// Index of the chunk being filled (-1 while there is none yet).
		$last = count( $results ) - 1;
		// Compare lengths explicitly: empty() would treat the chunk "0" as empty and silently overwrite it.
		if ( 0 <= $last && strlen( $results[ $last ] ) + $glue_length + strlen( $part ) <= $limit ) {
			// The part still fits, so glue it to the current chunk.
			$results[ $last ] .= $glue . $part;
		} else {
			// First part, or the current chunk is full: start a new chunk.
			$results[] = $part;
		}
	}

	return $results;
}
/**
 * Splits a part into smaller pieces that are as close as possible to but not greater than the specified length.
 *
 * Strategy, from coarse to fine:
 *  - a part containing spaces is split into sentences (after ".", "!" or "?" followed by spaces);
 *  - a sentence that is still longer than the limit is split into words;
 *  - a word that is still longer than the limit is cut by bytes without breaking UTF-8 characters;
 *  - the pieces are then packed back together, joined with a single space, by
 *    resize_text_parts_under_limit();
 *  - a part without any space is cut by bytes directly.
 *
 * Lengths are byte lengths (strlen), consistent with the other helpers in this file.
 *
 * @param string $part    The part to be split.
 * @param int    $limit   The maximum length of each resulting string.
 * @param array  $results The results array to be populated.
 *
 * @return array The results array with the pieces of the part appended (re-indexed by array_merge).
 * @noinspection PhpUnused
 */
function split_part( string $part = '', int $limit = 3200, array $results = [] ): array {
	// strpos() returns 0 for a leading space, so compare against false instead of testing for "> 0".
	if ( false === strpos( $part, ' ' ) ) {
		// no spaces, so split by limit (handle by character limit)
		return array_merge( $results, _split_part_utf8_safe( $part, $limit ) );
	}
	// split on sentence endings; without any sentence boundary this yields the part as a single item
	$sentences = preg_split( '/(?<=(?<!\d|\.)[.!?]) +/', $part, - 1, PREG_SPLIT_NO_EMPTY );
	if ( ! is_array( $sentences ) ) {
		// regex failure: fall back to treating the whole part as one sentence
		$sentences = [ $part ];
	}
	$pieces = [];
	foreach ( $sentences as $sentence ) {
		$sentence = trim( $sentence );
		if ( '' === $sentence ) {
			continue;
		}
		if ( strlen( $sentence ) <= $limit ) {
			$pieces[] = $sentence;
			continue;
		}
		// the sentence alone exceeds the limit, so handle it by word
		foreach ( explode( ' ', $sentence ) as $word ) {
			$word = trim( $word );
			if ( '' === $word ) {
				continue;
			}
			if ( strlen( $word ) <= $limit ) {
				$pieces[] = $word;
			} else {
				// even a single word is too long, so cut it by bytes
				$pieces = array_merge( $pieces, _split_part_utf8_safe( $word, $limit ) );
			}
		}
	}

	// pack the pieces to be close to but under the limit and add them to the results using a space as glue
	return array_merge( $results, resize_text_parts_under_limit( $pieces, $limit, ' ' ) );
}

/**
 * Cuts a string into consecutive pieces of at most $limit bytes without cutting inside a UTF-8 character.
 *
 * For ASCII input the result equals str_split( $text, $limit ). A limit below 1 is treated as 1, and a
 * single character wider than the limit is kept whole (that piece is then longer than the limit).
 *
 * @param string $text  The string to cut.
 * @param int    $limit The maximum number of bytes per piece.
 *
 * @return array The list of pieces; empty for an empty string.
 */
function _split_part_utf8_safe( string $text, int $limit ): array {
	$limit  = max( 1, $limit );
	$length = strlen( $text );
	$pieces = [];
	$offset = 0;
	while ( $offset < $length ) {
		$end = min( $offset + $limit, $length );
		$cut = $end;
		// A byte of the form 10xxxxxx continues a multi-byte character: move the cut back to its first byte.
		while ( $cut > $offset && $cut < $length && 0x80 === ( ord( $text[ $cut ] ) & 0xC0 ) ) {
			$cut --;
		}
		if ( $cut === $offset ) {
			// The character at $offset is wider than the limit: keep it whole by moving forward instead.
			$cut = $end;
			while ( $cut < $length && 0x80 === ( ord( $text[ $cut ] ) & 0xC0 ) ) {
				$cut ++;
			}
		}
		$pieces[] = substr( $text, $offset, $cut - $offset );
		$offset   = $cut;
	}

	return $pieces;
}
/**
 * Breaks a raw document text into chunks, each aiming to be as long as possible without exceeding the limit.
 *
 * The text is first normalized (leading punctuation stripped, ellipsis and long runs of dots, underscores
 * and dashes shortened, then passed through clean_text_string()). Text that already fits is returned as a
 * single chunk. Otherwise it is cut at line breaks that follow a sentence ending, short blocks are packed
 * together with a blank line between them, and blocks longer than the limit are handed to split_part().
 *
 * @param string $text  The raw document text to be chopped.
 * @param int    $limit The maximum length (strlen) targeted for each resulting string.
 *
 * @return array|string[] The non-empty chunks; an empty array when the text is blank.
 * @noinspection PhpUnused
 */
function chop_text_under_limit( string $text = '', int $limit = 3200 ): array {
	// nothing to do for blank input
	$text = trim( $text );
	if ( '' === $text ) {
		return [];
	}
	// setup
	$paragraph_glue        = "\n\n";
	$paragraph_glue_length = strlen( $paragraph_glue );
	$leading_separators    = '.!,;: ';
	// a line break right after ".", "!" or "?" (not preceded by a digit or a dot) marks the end of a block
	$block_boundary = "/(?<=(?<!\d|\.)[.!?])\n/m";

	// the text must not start with block separators or spaces
	$text = ltrim( $text, $leading_separators );
	// ellipsis character becomes three dots
	$text = str_replace( '…', '...', $text );
	// four or more dots/underscores become three dots, four or more dashes become three dashes (save tokens)
	$text = preg_replace( '/\.{4,}|_{4,}/', '...', $text );
	$text = preg_replace( '/-{4,}/', '---', $text );
	// clean the text before chopping
	$text = clean_text_string( $text );

	// short enough already: a single chunk
	if ( strlen( $text ) <= $limit ) {
		return [ $text ];
	}

	// pad with a space, strip leading separators again, and pad once more (the regexes rely on the padding)
	$text = ltrim( $text . ' ', $leading_separators ) . ' ';

	$chunks = [];
	if ( 1 === preg_match( $block_boundary, $text ) ) {
		// ready-made blocks exist, so handle the text block by block
		$blocks = preg_split( $block_boundary, $text, - 1, PREG_SPLIT_NO_EMPTY );
		foreach ( $blocks as $block ) {
			$block        = trim( $block );
			$block_length = strlen( $block );
			if ( $block_length > $limit ) {
				// too long on its own: cut it into smaller pieces
				$chunks = split_part( $block, $limit, $chunks );
			} elseif ( $block_length + $paragraph_glue_length > $limit ) {
				// fits alone but leaves no room for the glue: keep it as its own chunk
				$chunks[] = $block;
			} else {
				// small block: add it, then repack so it joins the previous chunk when there is room
				$chunks[] = $block;
				$chunks   = resize_text_parts_under_limit( $chunks, $limit, $paragraph_glue );
			}
		}
	} else {
		// no ready-made blocks: split the whole text
		$chunks = split_part( $text, $limit, $chunks );
	}

	// normalize every chunk: trim, single spaces only, at most one blank line in a row
	foreach ( $chunks as $key => $chunk ) {
		$chunk          = trim( $chunk );
		$chunk          = preg_replace( '/ {2,}/', ' ', $chunk );
		$chunks[ $key ] = preg_replace( '/\n{2,}/', "\n\n", $chunk );
	}

	// drop chunks that ended up empty
	return array_filter(
		$chunks,
		function ( $chunk ) {
			return '' !== trim( $chunk );
		}
	);
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
usage:
$text_chunks = chop_text_under_limit( $text, 4200 );