Skip to content

Instantly share code, notes, and snippets.

@sergeliatko
Last active August 25, 2023 13:59
Show Gist options
  • Save sergeliatko/0fc9925a4791d7afd8bd9b25545c4906 to your computer and use it in GitHub Desktop.
Save sergeliatko/0fc9925a4791d7afd8bd9b25545c4906 to your computer and use it in GitHub Desktop.
<?php
/**
* Updated: 2023-08-25 15:58 CEST - fixed bugs in block recognition, improved logic.
*/
/**
 * Normalizes a raw text string into a compact, line-based form.
 *
 * HTML entities are decoded, every kind of line break becomes "\n", runs of
 * two or more spaces collapse into one space, each line is trimmed, and lines
 * that end up empty are dropped.
 *
 * @param string $text The text string to be cleaned.
 *
 * @return string The cleaned text, with the surviving lines joined by a single "\n".
 * @noinspection PhpUnused
 */
function clean_text_string( string $text = '' ): string {
	// Turn entities such as &amp; back into their characters
	$decoded = html_entity_decode( $text, ENT_QUOTES | ENT_HTML5, 'UTF-8' );
	// Map Windows (\r\n) and old Mac (\r) line endings to \n; the longer sequence is matched first
	$normalized = strtr( $decoded, [ "\r\n" => "\n", "\r" => "\n" ] );
	// Collect only the lines that still carry content after clean-up
	$kept = [];
	foreach ( explode( "\n", $normalized ) as $raw_line ) {
		// Squash runs of spaces, then strip the surrounding whitespace
		$line = trim( preg_replace( '/ {2,}/', ' ', $raw_line ) );
		if ( '' === $line ) {
			// Nothing left on this line, skip it
			continue;
		}
		$kept[] = $line;
	}

	return implode( "\n", $kept );
}
/**
 * Packs consecutive text parts into chunks that stay as close as possible to, but not above, a length limit.
 *
 * Parts are processed in order. Each part is trimmed and then either glued to the
 * chunk built last (when chunk + glue + part still fits in the limit) or started as
 * a new chunk. Lengths are measured in bytes with strlen().
 *
 * A single part that is longer than the limit is never split here: it becomes a
 * chunk of its own and therefore exceeds the limit. Callers that need a hard
 * guarantee must split such parts beforehand.
 *
 * @param array  $parts An array of text parts (strings) to be joined.
 * @param int    $limit The maximum string length accepted (default: 3200).
 * @param string $glue  The string to use as a glue when joining parts (default: "\n\n").
 *
 * @return array A zero-indexed list of non-empty, trimmed chunks.
 * @noinspection PhpUnused
 */
function resize_text_parts_under_limit( array $parts = [], int $limit = 3200, string $glue = "\n\n" ): array {
	// Chunks built so far; the last item is the one still open for appending
	$results     = [];
	$glue_length = strlen( $glue );
	foreach ( $parts as $part ) {
		$part = trim( $part );
		// Skip parts without content so no dangling glue is ever appended
		if ( '' === $part ) {
			continue;
		}
		$last_index = count( $results ) - 1;
		// Compare lengths explicitly instead of using empty(): the chunk "0" is
		// valid content and must not be mistaken for "nothing collected yet"
		if ( 0 <= $last_index && strlen( $results[ $last_index ] ) + $glue_length + strlen( $part ) <= $limit ) {
			// Still under the limit: extend the open chunk
			$results[ $last_index ] .= $glue . $part;
		} else {
			// First part, or the open chunk is full: start a new chunk
			$results[] = $part;
		}
	}

	// Only non-empty parts were ever stored, so the list needs no further filtering
	return $results;
}
/**
 * Splits a part into smaller chunks that are as close as possible to, but not greater than, the limit.
 *
 * Strategy, from coarse to fine:
 *  1. if the part contains sentence endings followed by spaces, split it into sentences;
 *  2. otherwise, if it contains spaces, split it into words;
 *  3. otherwise cut it into pieces of at most $limit bytes.
 * Sentences or words that are themselves longer than the limit are split again with
 * the next finer strategy, because resize_text_parts_under_limit() only joins items
 * and never breaks an oversized one. The resulting pieces are re-joined with single
 * spaces up to the limit.
 *
 * Lengths are measured in bytes. The byte-level cut steps back to a UTF-8 character
 * boundary so multi-byte characters are not torn apart; only when the input is not
 * valid UTF-8, or the limit is narrower than one character, does it fall back to a
 * plain byte cut.
 *
 * @param string $part    The part to be split.
 * @param int    $limit   The maximum length of each resulting string; values below 1 are treated as 1.
 * @param array  $results The results array to be populated.
 *
 * @return array The results array with the chunks of the part appended (numeric keys are renumbered).
 * @noinspection PhpUnused
 */
function split_part( string $part = '', int $limit = 3200, array $results = [] ): array {
	// A limit below one byte cannot be honoured and would stall the byte-level cut
	$limit = max( 1, $limit );
	// Compare against false: strpos() returns 0 for a leading space, which is still a hit
	if ( false !== strpos( $part, ' ' ) ) {
		// A sentence break is [.!?] not preceded by a digit or a dot, followed by spaces
		$sentence_break = '/(?<=(?<!\d|\.)[.!?]) +/';
		$items          = false;
		if ( 0 < intval( preg_match_all( $sentence_break, $part ) ) ) {
			// distinct sentences found (handle by sentence)
			$items = preg_split( $sentence_break, $part, - 1, PREG_SPLIT_NO_EMPTY );
		}
		if ( ! is_array( $items ) ) {
			// no distinct sentences (or the split failed), so handle by word
			$items = array_filter(
				array_map( 'trim', explode( ' ', $part ) ),
				function ( $item ) {
					return '' !== $item;
				}
			);
		}
		// Break down every item that alone exceeds the limit. Each item is strictly
		// shorter than $part (at least one separator was removed), so this terminates.
		$pieces = [];
		foreach ( $items as $item ) {
			if ( strlen( $item ) <= $limit ) {
				$pieces[] = $item;
				continue;
			}
			foreach ( split_part( $item, $limit ) as $piece ) {
				$pieces[] = $piece;
			}
		}

		// Re-join the pieces up to the limit using a space as glue
		return array_merge( $results, resize_text_parts_under_limit( $pieces, $limit, ' ' ) );
	}
	// no spaces, so split by limit (handle by byte length)
	$chunks = [];
	$length = strlen( $part );
	$offset = 0;
	while ( $offset < $length ) {
		$size = min( $limit, $length - $offset );
		$cut  = $size;
		// If the byte right after the cut is a UTF-8 continuation byte (10xxxxxx) the cut
		// sits inside a character: step back, at most 3 bytes (longest UTF-8 sequence is 4)
		for ( $back = 0; 3 > $back && 0 < $cut && $offset + $cut < $length && 0x80 === ( ord( $part[ $offset + $cut ] ) & 0xC0 ); $back ++ ) {
			$cut --;
		}
		// No boundary found (invalid UTF-8 or limit narrower than the character):
		// use the plain byte cut so the limit is kept and the loop always advances
		if ( 0 === $cut || ( $offset + $cut < $length && 0x80 === ( ord( $part[ $offset + $cut ] ) & 0xC0 ) ) ) {
			$cut = $size;
		}
		$chunks[] = substr( $part, $offset, $cut );
		$offset   += $cut;
	}

	return array_merge( $results, $chunks );
}
/**
 * Chops a raw document text into a list of strings that are as close as possible to, but not greater than, the limit.
 *
 * Steps:
 *  1. trim the text, strip leading separators (. ! , ; : and spaces), shorten long runs
 *     of dots, underscores and dashes, then run clean_text_string() on it;
 *  2. return the text as a single chunk when it already fits in the limit;
 *  3. otherwise split it into blocks at line breaks that follow a sentence ending,
 *     join short neighbouring blocks with a blank line, and hand blocks that are too
 *     long to split_part();
 *  4. when no such line break exists, hand the whole text to split_part().
 * Lengths are measured in bytes with strlen().
 *
 * @param string $text  The raw document text to be chopped into parts with length close to but not greater than the limit.
 * @param int    $limit The maximum length of each resulting string.
 *
 * @return array|string[] A zero-indexed list of non-empty strings; empty when the text has no content.
 * @noinspection PhpUnused
 */
function chop_text_under_limit( string $text = '', int $limit = 3200 ): array {
	// make sure the text is not empty
	$text = trim( $text );
	if ( '' === $text ) {
		return [];
	}
	// setup
	$separators          = '.!,;: ';
	$double_break        = "\n\n";
	$double_break_length = strlen( $double_break );
	// make sure the text does not start with any of our block separators or empty spaces
	$text = ltrim( $text, $separators );
	// make sure ellipsis (…) is replaced with ...
	$text = str_replace( '…', '...', $text );
	// replace a sequence of more than 3 dots/underscores with 3 dots (save tokens);
	// preg_replace() returns null on failure, in which case the text is kept as is
	$text = preg_replace( '/\.{4,}|_{4,}/', '...', $text ) ?? $text;
	// replace a sequence of more than 3 dashes with 3 dashes (save tokens)
	$text = preg_replace( '/-{4,}/', '---', $text ) ?? $text;
	// clean the text before chopping
	$text = clean_text_string( $text );
	// stripping and cleaning may have consumed everything (e.g. a text made of dots only)
	if ( '' === $text ) {
		return [];
	}
	// return the text in an array if it is shorter than the limit
	if ( strlen( $text ) <= $limit ) {
		return [ $text ];
	}
	// cleaning may have exposed a new leading separator (first line removed), strip it again.
	// No trailing space is appended: split_part() detects sentences by a terminator followed
	// by a space, and padding would make a text without any inner sentence boundary look
	// like one single sentence.
	$text = ltrim( $text, $separators );
	// prepare the results container
	$results = [];
	// a block ends at a line break that follows a sentence ending (not preceded by a digit or a dot)
	$block_break = "/(?<=(?<!\d|\.)[.!?])\n/m";
	$parts       = false;
	if ( 0 < intval( preg_match_all( $block_break, $text ) ) ) {
		// we have ready blocks, so split the text into blocks
		$parts = preg_split( $block_break, $text, - 1, PREG_SPLIT_NO_EMPTY );
	}
	if ( is_array( $parts ) ) {
		foreach ( $parts as $part ) {
			// remove leading and trailing spaces from the part
			$part        = trim( $part );
			$part_length = strlen( $part );
			if ( $part_length + $double_break_length <= $limit ) {
				// the part fits together with a separator: add it, then let the
				// resizer join neighbouring results wherever they fit in the limit
				$results[] = $part;
				$results   = resize_text_parts_under_limit( $results, $limit, $double_break );
			} elseif ( $part_length <= $limit ) {
				// the part alone fits in the limit, so add it to the results as is
				$results[] = $part;
			} else {
				// the part is too long to fit in the limit, so we need to split it
				$results = split_part( $part, $limit, $results );
			}
		}
	} else {
		// no ready to use blocks, so split the whole text
		$results = split_part( $text, $limit, $results );
	}
	// normalize results items and drop the empty ones
	$chunks = [];
	foreach ( $results as $result ) {
		$result = trim( $result );
		// replace double spaces with single spaces
		$result = preg_replace( '/ {2,}/', ' ', $result ) ?? $result;
		// replace sequences of 2 or more new lines with 2 new lines
		$result = preg_replace( '/\n{2,}/', "\n\n", $result ) ?? $result;
		if ( '' !== trim( $result ) ) {
			// appending re-indexes the list, so callers always get keys 0..n-1
			$chunks[] = $result;
		}
	}

	// return the results
	return $chunks;
}
@sergeliatko
Copy link
Author

sergeliatko commented Aug 22, 2023

usage:

$text_chunks = chop_text_under_limit( $text, 4200 );

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment