sergeliatko · June 13, 2023 23:45 · sergeliatko · May 30, 2023
diff --git a/text-pre-formatter.php b/text-pre-formatter.php
 <?php

 /**
 * Cleans a text string by normalizing line breaks, collapsing runs of spaces, and removing empty lines.
 *
 * The steps are applied in this order:
 *  1. "\r\n" and "\r" are converted to "\n" (always "\n", regardless of the platform's PHP_EOL).
 *  2. Two or more consecutive spaces become a single space.
 *  3. Leading spaces/tabs are removed from every line.
 *  4. Trailing spaces/tabs are removed from every line.
 *  5. Empty lines (and any remaining leading whitespace) are removed.
 *
 * A single trailing line break at the very end of the text is left in place.
 *
 * @param string $text The text string to be cleaned.
 *
 * @return string The cleaned text string. If the regex engine reports an error, the text is returned with only
 *                the line breaks normalized.
 */
 function clean_text( string $text ): string {
 	// Normalize line breaks to "\n". A literal "\n" is used instead of PHP_EOL because on platforms where
 	// PHP_EOL is "\r\n" the second search term ("\r") would match the replacement that was just inserted.
 	$text = str_replace( [ "\r\n", "\r" ], "\n", $text );

 	// Patterns are applied one after another, in the listed order
 	$patterns = [
 		'/ {2,}/',     // two or more spaces -> single space
 		'/^[ \t]+/m',  // leading spaces (must run before trailing spaces are removed)
 		'/[ \t]+$/m',  // trailing spaces (must run before empty lines are removed)
 		'/^\s+/m',     // empty lines
 	];
 	$replacements = [ ' ', '', '', '' ];

 	// preg_replace() returns null on failure: fall back to the text with normalized line breaks
 	return preg_replace( $patterns, $replacements, $text ) ?? $text;
 }

 /**
 * Joins text parts into as few strings as possible, keeping each string within $limit where it can.
 *
 * Parts are trimmed and appended to the current string (separated by $glue) while the result still fits
 * into $limit; otherwise a new string is started. Lengths are measured with strlen(), i.e. in bytes.
 *
 * Notes:
 *  - Parts that are empty after trimming are skipped, so no dangling glue is produced.
 *  - A single part that is longer than $limit is never cut: it is returned as its own string and will
 *    exceed the limit.
 *  - The result always holds at least one element; when there is nothing to join it is [ '' ].
 *
 * @param array  $parts An array of text parts to be joined.
 * @param int    $limit The maximum length (in bytes) of each resulting string.
 * @param string $glue  The separator used to join text parts (default: PHP_EOL).
 *
 * @return array An array of joined strings.
 */
 function join_text_chops( array $parts = [], int $limit = 1650, string $glue = PHP_EOL ): array {
 	// Finished strings
 	$results = [];
 	// String currently being filled
 	$current = '';
 	foreach ( $parts as $part ) {
 		// Remove leading and trailing spaces from the part
 		$part = trim( $part );
 		// Nothing to add: appending an empty part would only leave a dangling glue
 		if ( '' === $part ) {
 			continue;
 		}
 		// The glue only counts when there is something to glue to. A strict comparison is used on purpose:
 		// empty() would treat the string "0" as empty and silently drop the glue.
 		$candidate = ( '' === $current ) ? $part : $current . $glue . $part;
 		if ( strlen( $candidate ) <= $limit ) {
 			// The part fits: keep filling the current string
 			$current = $candidate;
 			continue;
 		}
 		// The part does not fit: flush the current string (never an empty one) and start a new one
 		if ( '' !== $current ) {
 			$results[] = $current;
 		}
 		$current = $part;
 	}
 	// Add the last string; an empty one is only added to keep the "at least one element" guarantee
 	if ( '' !== $current || [] === $results ) {
 		$results[] = $current;
 	}

 	return $results;
 }

 /**
 * Chops raw text into an array of strings that try to stay within $limit.
 *
 * The text is cleaned with clean_text() first. If it already fits into $limit it is returned as a single
 * element. Otherwise it is split at line breaks that follow a sentence separator (. ! ? ; :), over-long
 * paragraphs are further split at sentence endings (. ! ?), and the pieces are packed back together with
 * join_text_chops(). Separators preceded by a digit or a dot (e.g. "1." or "...") are not split points.
 *
 * Lengths are measured with strlen(), i.e. in bytes. A single sentence longer than $limit is not cut.
 *
 * @param string $text  The raw document text to be chopped.
 * @param int    $limit The maximum length (in bytes) of each resulting string.
 *
 * @return array An array of text chunks.
 */
 function chop_raw_text( string $text, int $limit = 1650 ): array {
 	// Clean the text before chopping
 	$text = clean_text( $text );
 	// Return the text in an array if it is shorter than the limit
 	if ( strlen( $text ) <= $limit ) {
 		return [ $text ];
 	}
 	// Line break(s) right after a sentence separator that is not preceded by a digit or a dot
 	$paragraph_break = '/(?<=(?<!\d|\.)[.!?;:])\s+$/m';
 	// Space(s) right after a sentence ending that is not preceded by a digit or a dot
 	$sentence_break = '/(?<=(?<!\d|\.)[.!?]) +/';
 	// Make sure the text does not start with any of the separators, and an empty space is added at the end (for regex)
 	$text = ltrim( $text, '.!,;: ' ) . ' ';
 	// Make sure … (ellipsis) is replaced with ... (3 dots)
 	$text = str_replace( '…', '...', $text );
 	// Make sure a sequence of dots/underscores more than 3 is replaced with 3 dots (keep the text on regex failure)
 	$text = preg_replace( '/\.{4,}|_{4,}/', '...', $text ) ?? $text;
 	// Try to split at line breaks after the sentence separators
 	if ( 0 < preg_match_all( $paragraph_break, $text ) ) {
 		$results = [];
 		// -1 means "no limit"; preg_split() returns false on failure, in which case the text is kept whole
 		$parts = preg_split( $paragraph_break, $text, -1, PREG_SPLIT_NO_EMPTY ) ?: [ $text ];
 		foreach ( $parts as $part ) {
 			// Remove leading and trailing spaces from the part
 			$part = trim( $part );
 			if ( strlen( $part ) <= $limit ) {
 				// The paragraph fits: add it to the results as is
 				$results[] = $part;
 				continue;
 			}
 			// The paragraph is too long: split it on sentence endings
 			$items = preg_split( $sentence_break, $part, -1, PREG_SPLIT_NO_EMPTY ) ?: [ $part ];
 			// Stuff as many sentences as possible into each paragraph (get closer to the limit) joining with spaces
 			$results = array_merge( $results, join_text_chops( $items, $limit, ' ' ) );
 		}
 	} else {
 		// Split on sentence endings as the text does not have any obvious line breaks to split on
 		$results = preg_split( $sentence_break, $text, -1, PREG_SPLIT_NO_EMPTY ) ?: [ $text ];
 	}

 	// Pack the pieces together (joined with line breaks) to get as close to the limit as possible
 	return join_text_chops( $results, $limit );
 }


 // have fun
	<?php

	/**
	* Cleans a text string by normalizing line breaks, collapsing runs of spaces, and removing empty lines.
	*
	* The steps are applied in this order:
	*  1. "\r\n" and "\r" are converted to "\n" (always "\n", regardless of the platform's PHP_EOL).
	*  2. Two or more consecutive spaces become a single space.
	*  3. Leading spaces/tabs are removed from every line.
	*  4. Trailing spaces/tabs are removed from every line.
	*  5. Empty lines (and any remaining leading whitespace) are removed.
	*
	* A single trailing line break at the very end of the text is left in place.
	*
	* @param string $text The text string to be cleaned.
	*
	* @return string The cleaned text string. If the regex engine reports an error, the text is returned with only
	*                the line breaks normalized.
	*/
	function clean_text( string $text ): string {
		// Normalize line breaks to "\n". A literal "\n" is used instead of PHP_EOL because on platforms where
		// PHP_EOL is "\r\n" the second search term ("\r") would match the replacement that was just inserted.
		$text = str_replace( [ "\r\n", "\r" ], "\n", $text );

		// Patterns are applied one after another, in the listed order
		$patterns = [
			'/ {2,}/',     // two or more spaces -> single space
			'/^[ \t]+/m',  // leading spaces (must run before trailing spaces are removed)
			'/[ \t]+$/m',  // trailing spaces (must run before empty lines are removed)
			'/^\s+/m',     // empty lines
		];
		$replacements = [ ' ', '', '', '' ];

		// preg_replace() returns null on failure: fall back to the text with normalized line breaks
		return preg_replace( $patterns, $replacements, $text ) ?? $text;
	}

	/**
	* Joins text parts into as few strings as possible, keeping each string within $limit where it can.
	*
	* Parts are trimmed and appended to the current string (separated by $glue) while the result still fits
	* into $limit; otherwise a new string is started. Lengths are measured with strlen(), i.e. in bytes.
	*
	* Notes:
	*  - Parts that are empty after trimming are skipped, so no dangling glue is produced.
	*  - A single part that is longer than $limit is never cut: it is returned as its own string and will
	*    exceed the limit.
	*  - The result always holds at least one element; when there is nothing to join it is [ '' ].
	*
	* @param array $parts An array of text parts to be joined.
	* @param int $limit The maximum length (in bytes) of each resulting string.
	* @param string $glue The separator used to join text parts (default: PHP_EOL).
	*
	* @return array An array of joined strings.
	*/
	function join_text_chops( array $parts = [], int $limit = 1650, string $glue = PHP_EOL ): array {
		// Finished strings
		$results = [];
		// String currently being filled
		$current = '';
		foreach ( $parts as $part ) {
			// Remove leading and trailing spaces from the part
			$part = trim( $part );
			// Nothing to add: appending an empty part would only leave a dangling glue
			if ( '' === $part ) {
				continue;
			}
			// The glue only counts when there is something to glue to. A strict comparison is used on purpose:
			// empty() would treat the string "0" as empty and silently drop the glue.
			$candidate = ( '' === $current ) ? $part : $current . $glue . $part;
			if ( strlen( $candidate ) <= $limit ) {
				// The part fits: keep filling the current string
				$current = $candidate;
				continue;
			}
			// The part does not fit: flush the current string (never an empty one) and start a new one
			if ( '' !== $current ) {
				$results[] = $current;
			}
			$current = $part;
		}
		// Add the last string; an empty one is only added to keep the "at least one element" guarantee
		if ( '' !== $current || [] === $results ) {
			$results[] = $current;
		}

		return $results;
	}

	/**
	* Chops raw text into an array of strings that try to stay within $limit.
	*
	* The text is cleaned with clean_text() first. If it already fits into $limit it is returned as a single
	* element. Otherwise it is split at line breaks that follow a sentence separator (. ! ? ; :), over-long
	* paragraphs are further split at sentence endings (. ! ?), and the pieces are packed back together with
	* join_text_chops(). Separators preceded by a digit or a dot (e.g. "1." or "...") are not split points.
	*
	* Lengths are measured with strlen(), i.e. in bytes. A single sentence longer than $limit is not cut.
	*
	* @param string $text The raw document text to be chopped.
	* @param int $limit The maximum length (in bytes) of each resulting string.
	*
	* @return array An array of text chunks.
	*/
	function chop_raw_text( string $text, int $limit = 1650 ): array {
		// Clean the text before chopping
		$text = clean_text( $text );
		// Return the text in an array if it is shorter than the limit
		if ( strlen( $text ) <= $limit ) {
			return [ $text ];
		}
		// Line break(s) right after a sentence separator that is not preceded by a digit or a dot.
		// The alternation must be a plain "|": an escaped "\|" would match a literal pipe character instead.
		$paragraph_break = '/(?<=(?<!\d|\.)[.!?;:])\s+$/m';
		// Space(s) right after a sentence ending that is not preceded by a digit or a dot
		$sentence_break = '/(?<=(?<!\d|\.)[.!?]) +/';
		// Make sure the text does not start with any of the separators, and an empty space is added at the end (for regex)
		$text = ltrim( $text, '.!,;: ' ) . ' ';
		// Make sure … (ellipsis) is replaced with ... (3 dots)
		$text = str_replace( '…', '...', $text );
		// Make sure a sequence of dots/underscores more than 3 is replaced with 3 dots (keep the text on regex failure)
		$text = preg_replace( '/\.{4,}|_{4,}/', '...', $text ) ?? $text;
		// Try to split at line breaks after the sentence separators
		if ( 0 < preg_match_all( $paragraph_break, $text ) ) {
			$results = [];
			// -1 means "no limit"; preg_split() returns false on failure, in which case the text is kept whole
			$parts = preg_split( $paragraph_break, $text, -1, PREG_SPLIT_NO_EMPTY ) ?: [ $text ];
			foreach ( $parts as $part ) {
				// Remove leading and trailing spaces from the part
				$part = trim( $part );
				if ( strlen( $part ) <= $limit ) {
					// The paragraph fits: add it to the results as is
					$results[] = $part;
					continue;
				}
				// The paragraph is too long: split it on sentence endings
				$items = preg_split( $sentence_break, $part, -1, PREG_SPLIT_NO_EMPTY ) ?: [ $part ];
				// Stuff as many sentences as possible into each paragraph (get closer to the limit) joining with spaces
				$results = array_merge( $results, join_text_chops( $items, $limit, ' ' ) );
			}
		} else {
			// Split on sentence endings as the text does not have any obvious line breaks to split on
			$results = preg_split( $sentence_break, $text, -1, PREG_SPLIT_NO_EMPTY ) ?: [ $text ];
		}

		// Pack the pieces together (joined with line breaks) to get as close to the limit as possible
		return join_text_chops( $results, $limit );
	}


	// have fun