sergeliatko · August 25, 2023 13:59 · sergeliatko · Aug 22, 2023
diff --git a/text-chopper.php b/text-chopper.php
 <?php
 /**
 * Updated: 2023-08-25 15:58 CEST - fixed bugs in block recognition, improved logic.
 */

 /**
 * Normalizes a text string: decodes HTML entities, unifies line breaks, collapses runs of spaces
 * and drops every line that holds nothing but whitespace.
 *
 * @param string $text The raw text to normalize.
 *
 * @return string The normalized text.
 *                Note: the remaining lines are trimmed and separated by \n (one new line).
 * @noinspection PhpUnused
 */
 function clean_text_string( string $text = '' ): string {
 	// Turn entities (named, numeric, quotes included) back into their characters
 	$decoded = html_entity_decode( $text, ENT_QUOTES | ENT_HTML5, 'UTF-8' );

 	$kept = [];

 	// Each of \r\n, \r and \n counts as one line break
 	foreach ( preg_split( '/\r\n|\r|\n/', $decoded ) as $raw_line ) {
 		// Squeeze runs of spaces into one, then strip the surrounding whitespace
 		$line = trim( preg_replace( '/ {2,}/', ' ', $raw_line ) );

 		// Lines that end up empty are left out
 		if ( '' !== $line ) {
 			$kept[] = $line;
 		}
 	}

 	return implode( "\n", $kept );
 }

 /**
 * Packs consecutive text parts into strings that stay within a length limit.
 *
 * Parts are trimmed and visited in order. A part is joined (with the glue) to the string built so far
 * whenever the joined string fits in the limit; otherwise it starts a new string. Lengths are measured
 * with strlen(), i.e. in bytes. Empty parts are ignored. A single part that is longer than the limit
 * is not split here: it is returned unchanged as its own string.
 *
 * @param array  $parts An array of text parts (strings) to be joined.
 * @param int    $limit The maximum string length accepted (default: 3200).
 * @param string $glue  The string to use as a glue when joining parts (default: "\n\n").
 *
 * @return array A list (sequential keys) of non-empty strings, in the order of the input parts.
 * @noinspection PhpUnused
 */
 function resize_text_parts_under_limit( array $parts = [], int $limit = 3200, string $glue = "\n\n" ): array {
 	$results     = [];
 	$glue_length = strlen( $glue );

 	foreach ( $parts as $part ) {
 		$part = trim( $part );

 		// Nothing to add; skipping also avoids leaving a dangling glue behind
 		if ( '' === $part ) {
 			continue;
 		}

 		$last_index = count( $results ) - 1;

 		// Very first part: there is nothing to join it to yet
 		if ( 0 > $last_index ) {
 			$results[] = $part;
 			continue;
 		}

 		// Lengths are compared explicitly: empty() considers the string "0" empty and would lose it
 		$current = $results[ $last_index ];

 		if ( strlen( $current ) + $glue_length + strlen( $part ) <= $limit ) {
 			// The joined string still fits, so extend the current one
 			$results[ $last_index ] = $current . $glue . $part;
 		} else {
 			// Over the limit, so the part starts a new string
 			$results[] = $part;
 		}
 	}

 	return $results;
 }

 /**
 * Splits a part into smaller pieces that are as close as possible to, but not longer than, the limit.
 *
 * The part is cut from coarse to fine:
 * - a part that contains spaces is cut at sentence ends when it has any, at single words otherwise;
 * - a sentence that is still longer than the limit is cut into words;
 * - a word (or a part without any space) that is longer than the limit is cut every $limit bytes.
 * The pieces are then packed back together, glued with one space, by resize_text_parts_under_limit().
 *
 * NOTE: lengths are byte lengths (strlen()/str_split()), so the byte-wise cut of an oversized word
 * may fall inside a multi-byte character.
 *
 * @param string $part    The part to be split.
 * @param int    $limit   The maximum length of each resulting string.
 * @param array  $results The results array to be populated.
 *
 * @return array The results array with the pieces of the part appended.
 * @noinspection PhpUnused
 */
 function split_part( string $part = '', int $limit = 3200, array $results = [] ): array {
 	// str_split() rejects a chunk length below 1
 	$chunk_length = max( 1, $limit );

 	// Compare with false: strpos() returns 0, not false, when the space is the very first character
 	if ( false === strpos( $part, ' ' ) ) {
 		// no spaces, so split by limit (handle by character limit)
 		return array_merge( $results, str_split( $part, $chunk_length ) );
 	}

 	// A sentence ends with ".", "!" or "?" that is not preceded by a digit or a dot and is followed by spaces
 	$sentence_end = '/(?<=(?<!\d|\.)[.!?]) +/';

 	if ( 1 === preg_match( $sentence_end, $part ) ) {
 		// distinct sentences (handle by sentence)
 		$items = preg_split( $sentence_end, $part, - 1, PREG_SPLIT_NO_EMPTY );
 	} else {
 		// no distinct sentences, so split on spaces (handle by word)
 		$items = explode( ' ', $part );
 	}

 	$pieces = [];

 	foreach ( $items as $item ) {
 		$item = trim( $item );

 		if ( '' === $item ) {
 			continue;
 		}

 		if ( strlen( $item ) <= $limit ) {
 			$pieces[] = $item;
 			continue;
 		}

 		// The item alone exceeds the limit: go down to words, and cut the words that are still too long
 		foreach ( explode( ' ', $item ) as $word ) {
 			$word = trim( $word );

 			if ( '' === $word ) {
 				continue;
 			}

 			if ( strlen( $word ) <= $limit ) {
 				$pieces[] = $word;
 				continue;
 			}

 			foreach ( str_split( $word, $chunk_length ) as $chunk ) {
 				$pieces[] = $chunk;
 			}
 		}
 	}

 	// pack the pieces to be close to but under the limit, using a space as glue, and add them to the results
 	return array_merge( $results, resize_text_parts_under_limit( $pieces, $limit, ' ' ) );
 }

 /**
 * Chops a raw document text into an array of strings, where each string is as close as possible to
 * but not greater than the specified length.
 *
 * Steps:
 * 1. leading separators (. ! , ; : and spaces) are removed, the ellipsis character and long runs of
 *    dots, underscores or dashes are shortened, and the text goes through clean_text_string();
 * 2. a text that already fits in the limit is returned as the only item;
 * 3. otherwise the text is cut into paragraphs (a line break right after a sentence end) when it has
 *    any; short paragraphs are packed together with a blank line between them and paragraphs longer
 *    than the limit are handed to split_part(). A text without such paragraphs is handed to
 *    split_part() as a whole.
 *
 * @param string $text  The raw document text to be chopped into parts with length close to but not greater than the limit.
 * @param int    $limit The maximum length (in bytes, as measured by strlen()) of each resulting string.
 *
 * @return array|string[] A list (sequential keys) of non-empty strings; empty when the text holds
 *                        nothing but separators and whitespace.
 * @noinspection PhpUnused
 */
 function chop_text_under_limit( string $text = '', int $limit = 3200 ): array {
 	// characters that must not start the text: our block separators and the space
 	$separators          = '.!,;: ';
 	$double_break        = "\n\n";
 	$double_break_length = strlen( $double_break );

 	// make sure the text is not empty
 	$text = trim( $text );
 	if ( '' === $text ) {
 		return [];
 	}

 	// make sure the text does not start with any of our block separators or empty spaces
 	$text = ltrim( $text, $separators );

 	// make sure ellipsis (…) is replaced with ...
 	$text = str_replace( '…', '...', $text );

 	// a sequence of 4 or more dots/underscores is replaced with 3 dots (save tokens)
 	$text = preg_replace( '/\.{4,}|_{4,}/', '...', $text );

 	// a sequence of 4 or more dashes is replaced with 3 dashes (save tokens)
 	$text = preg_replace( '/-{4,}/', '---', $text );

 	// clean the text before chopping
 	$text = clean_text_string( $text );

 	// the text held nothing but separators and whitespace: there are no chunks (not one empty chunk)
 	if ( '' === $text ) {
 		return [];
 	}

 	// return the text in an array if it is not longer than the limit
 	if ( strlen( $text ) <= $limit ) {
 		return [ $text ];
 	}

 	// cleaning decodes HTML entities, which may have uncovered new leading separators.
 	// No space is appended at the end: a trailing space would count as a sentence break after the
 	// final period, and a long single-sentence text would then come back as one oversized chunk.
 	$text = trim( ltrim( $text, $separators ) );

 	// a paragraph ends with a line break right after ".", "!" or "?" (not preceded by a digit or a dot)
 	$paragraph_end = "/(?<=(?<!\d|\.)[.!?])\n/m";

 	// prepare the results container
 	$results = [];

 	if ( 1 === preg_match( $paragraph_end, $text ) ) {
 		// we have ready blocks, so handle the text paragraph by paragraph
 		foreach ( preg_split( $paragraph_end, $text, - 1, PREG_SPLIT_NO_EMPTY ) as $part ) {
 			$part        = trim( $part );
 			$part_length = strlen( $part );

 			if ( $part_length + $double_break_length <= $limit ) {
 				// short enough to share a chunk: add it, then let the packer join what fits together
 				$results[] = $part;
 				$results   = resize_text_parts_under_limit( $results, $limit, $double_break );
 			} elseif ( $part_length <= $limit ) {
 				// the part alone fits in the limit, so add it to the results as is
 				$results[] = $part;
 			} else {
 				// the part is too long to fit in the limit, so we need to split it
 				$results = split_part( $part, $limit, $results );
 			}
 		}
 	} else {
 		// no ready to use blocks, so split the whole text
 		$results = split_part( $text, $limit, $results );
 	}

 	// normalize the chunks and leave the empty ones out
 	$chunks = [];

 	foreach ( $results as $result ) {
 		$result = trim( $result );
 		// replace double spaces with single spaces
 		$result = preg_replace( '/ {2,}/', ' ', $result );
 		// replace sequences of 2 or more new lines with 2 new lines
 		$result = preg_replace( '/\n{2,}/', "\n\n", $result );

 		if ( 0 < strlen( trim( $result ) ) ) {
 			$chunks[] = $result;
 		}
 	}

 	return $chunks;
 }
	<?php
	/**
	* Updated: 2023-08-25 15:58 CEST - fixed bugs in block recognition, improved logic.
	*/

	/**
	* Normalizes a text string: decodes HTML entities, unifies line breaks, collapses runs of spaces
	* and drops every line that holds nothing but whitespace.
	*
	* @param string $text The raw text to normalize.
	*
	* @return string The normalized text.
	* Note: the remaining lines are trimmed and separated by \n (one new line).
	* @noinspection PhpUnused
	*/
	function clean_text_string( string $text = '' ): string {
		// Turn entities (named, numeric, quotes included) back into their characters.
		// The flags are combined with a plain bitwise OR.
		$decoded = html_entity_decode( $text, ENT_QUOTES | ENT_HTML5, 'UTF-8' );

		$kept = [];

		// Each of \r\n, \r and \n counts as one line break
		foreach ( preg_split( '/\r\n|\r|\n/', $decoded ) as $raw_line ) {
			// Squeeze runs of spaces into one, then strip the surrounding whitespace
			$line = trim( preg_replace( '/ {2,}/', ' ', $raw_line ) );

			// Lines that end up empty are left out
			if ( '' !== $line ) {
				$kept[] = $line;
			}
		}

		return implode( "\n", $kept );
	}

	/**
	* Packs consecutive text parts into strings that stay within a length limit.
	*
	* Parts are trimmed and visited in order. A part is joined (with the glue) to the string built so far
	* whenever the joined string fits in the limit; otherwise it starts a new string. Lengths are measured
	* with strlen(), i.e. in bytes. Empty parts are ignored. A single part that is longer than the limit
	* is not split here: it is returned unchanged as its own string.
	*
	* @param array $parts An array of text parts (strings) to be joined.
	* @param int $limit The maximum string length accepted (default: 3200).
	* @param string $glue The string to use as a glue when joining parts (default: "\n\n").
	*
	* @return array A list (sequential keys) of non-empty strings, in the order of the input parts.
	* @noinspection PhpUnused
	*/
	function resize_text_parts_under_limit( array $parts = [], int $limit = 3200, string $glue = "\n\n" ): array {
		$results     = [];
		$glue_length = strlen( $glue );

		foreach ( $parts as $part ) {
			$part = trim( $part );

			// Nothing to add; skipping also avoids leaving a dangling glue behind
			if ( '' === $part ) {
				continue;
			}

			$last_index = count( $results ) - 1;

			// Very first part: there is nothing to join it to yet
			if ( 0 > $last_index ) {
				$results[] = $part;
				continue;
			}

			// Lengths are compared explicitly: empty() considers the string "0" empty and would lose it
			$current = $results[ $last_index ];

			if ( strlen( $current ) + $glue_length + strlen( $part ) <= $limit ) {
				// The joined string still fits, so extend the current one
				$results[ $last_index ] = $current . $glue . $part;
			} else {
				// Over the limit, so the part starts a new string
				$results[] = $part;
			}
		}

		return $results;
	}

	/**
	* Splits a part into smaller pieces that are as close as possible to, but not longer than, the limit.
	*
	* The part is cut from coarse to fine:
	* - a part that contains spaces is cut at sentence ends when it has any, at single words otherwise;
	* - a sentence that is still longer than the limit is cut into words;
	* - a word (or a part without any space) that is longer than the limit is cut every $limit bytes.
	* The pieces are then packed back together, glued with one space, by resize_text_parts_under_limit().
	*
	* NOTE: lengths are byte lengths (strlen()/str_split()), so the byte-wise cut of an oversized word
	* may fall inside a multi-byte character.
	*
	* @param string $part The part to be split.
	* @param int $limit The maximum length of each resulting string.
	* @param array $results The results array to be populated.
	*
	* @return array The results array with the pieces of the part appended.
	* @noinspection PhpUnused
	*/
	function split_part( string $part = '', int $limit = 3200, array $results = [] ): array {
		// str_split() rejects a chunk length below 1
		$chunk_length = max( 1, $limit );

		// Compare with false: strpos() returns 0, not false, when the space is the very first character
		if ( false === strpos( $part, ' ' ) ) {
			// no spaces, so split by limit (handle by character limit)
			return array_merge( $results, str_split( $part, $chunk_length ) );
		}

		// A sentence ends with ".", "!" or "?" that is not preceded by a digit or a dot and is followed by spaces.
		// The "|" inside the lookbehind is a regex alternation and must not be escaped.
		$sentence_end = '/(?<=(?<!\d|\.)[.!?]) +/';

		if ( 1 === preg_match( $sentence_end, $part ) ) {
			// distinct sentences (handle by sentence)
			$items = preg_split( $sentence_end, $part, - 1, PREG_SPLIT_NO_EMPTY );
		} else {
			// no distinct sentences, so split on spaces (handle by word)
			$items = explode( ' ', $part );
		}

		$pieces = [];

		foreach ( $items as $item ) {
			$item = trim( $item );

			if ( '' === $item ) {
				continue;
			}

			if ( strlen( $item ) <= $limit ) {
				$pieces[] = $item;
				continue;
			}

			// The item alone exceeds the limit: go down to words, and cut the words that are still too long
			foreach ( explode( ' ', $item ) as $word ) {
				$word = trim( $word );

				if ( '' === $word ) {
					continue;
				}

				if ( strlen( $word ) <= $limit ) {
					$pieces[] = $word;
					continue;
				}

				foreach ( str_split( $word, $chunk_length ) as $chunk ) {
					$pieces[] = $chunk;
				}
			}
		}

		// pack the pieces to be close to but under the limit, using a space as glue, and add them to the results
		return array_merge( $results, resize_text_parts_under_limit( $pieces, $limit, ' ' ) );
	}

	/**
	* Chops a raw document text into an array of strings, where each string is as close as possible to
	* but not greater than the specified length.
	*
	* Steps:
	* 1. leading separators (. ! , ; : and spaces) are removed, the ellipsis character and long runs of
	*    dots, underscores or dashes are shortened, and the text goes through clean_text_string();
	* 2. a text that already fits in the limit is returned as the only item;
	* 3. otherwise the text is cut into paragraphs (a line break right after a sentence end) when it has
	*    any; short paragraphs are packed together with a blank line between them and paragraphs longer
	*    than the limit are handed to split_part(). A text without such paragraphs is handed to
	*    split_part() as a whole.
	*
	* @param string $text The raw document text to be chopped into parts with length close to but not greater than the limit.
	* @param int $limit The maximum length (in bytes, as measured by strlen()) of each resulting string.
	*
	* @return array|string[] A list (sequential keys) of non-empty strings; empty when the text holds
	* nothing but separators and whitespace.
	* @noinspection PhpUnused
	*/
	function chop_text_under_limit( string $text = '', int $limit = 3200 ): array {
		// characters that must not start the text: our block separators and the space
		$separators          = '.!,;: ';
		$double_break        = "\n\n";
		$double_break_length = strlen( $double_break );

		// make sure the text is not empty
		$text = trim( $text );
		if ( '' === $text ) {
			return [];
		}

		// make sure the text does not start with any of our block separators or empty spaces
		$text = ltrim( $text, $separators );

		// make sure ellipsis (…) is replaced with ...
		$text = str_replace( '…', '...', $text );

		// a sequence of 4 or more dots/underscores is replaced with 3 dots (save tokens);
		// the "|" is a regex alternation and must not be escaped
		$text = preg_replace( '/\.{4,}|_{4,}/', '...', $text );

		// a sequence of 4 or more dashes is replaced with 3 dashes (save tokens)
		$text = preg_replace( '/-{4,}/', '---', $text );

		// clean the text before chopping
		$text = clean_text_string( $text );

		// the text held nothing but separators and whitespace: there are no chunks (not one empty chunk)
		if ( '' === $text ) {
			return [];
		}

		// return the text in an array if it is not longer than the limit
		if ( strlen( $text ) <= $limit ) {
			return [ $text ];
		}

		// cleaning decodes HTML entities, which may have uncovered new leading separators.
		// No space is appended at the end: a trailing space would count as a sentence break after the
		// final period, and a long single-sentence text would then come back as one oversized chunk.
		$text = trim( ltrim( $text, $separators ) );

		// a paragraph ends with a line break right after ".", "!" or "?" (not preceded by a digit or a dot)
		$paragraph_end = "/(?<=(?<!\d|\.)[.!?])\n/m";

		// prepare the results container
		$results = [];

		if ( 1 === preg_match( $paragraph_end, $text ) ) {
			// we have ready blocks, so handle the text paragraph by paragraph
			foreach ( preg_split( $paragraph_end, $text, - 1, PREG_SPLIT_NO_EMPTY ) as $part ) {
				$part        = trim( $part );
				$part_length = strlen( $part );

				if ( $part_length + $double_break_length <= $limit ) {
					// short enough to share a chunk: add it, then let the packer join what fits together
					$results[] = $part;
					$results   = resize_text_parts_under_limit( $results, $limit, $double_break );
				} elseif ( $part_length <= $limit ) {
					// the part alone fits in the limit, so add it to the results as is
					$results[] = $part;
				} else {
					// the part is too long to fit in the limit, so we need to split it
					$results = split_part( $part, $limit, $results );
				}
			}
		} else {
			// no ready to use blocks, so split the whole text
			$results = split_part( $text, $limit, $results );
		}

		// normalize the chunks and leave the empty ones out
		$chunks = [];

		foreach ( $results as $result ) {
			$result = trim( $result );
			// replace double spaces with single spaces
			$result = preg_replace( '/ {2,}/', ' ', $result );
			// replace sequences of 2 or more new lines with 2 new lines
			$result = preg_replace( '/\n{2,}/', "\n\n", $result );

			if ( 0 < strlen( trim( $result ) ) ) {
				$chunks[] = $result;
			}
		}

		return $chunks;
	}