Last active
June 13, 2023 23:45
-
-
Save sergeliatko/c396a18be27559dcc02c40bd7d954b2d to your computer and use it in GitHub Desktop.
PHP code to preformat raw text and cut it into chunks before sending it to the OpenAI API
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php
/**
 * Cleans a text string by normalizing line breaks, removing empty lines, and normalizing spaces.
 *
 * The returned text uses "\n" as its only line separator, contains no runs of two or more
 * spaces, has no leading or trailing spaces/tabs on any line, and has no empty lines.
 *
 * @param string $text The text string to be cleaned.
 *
 * @return string The cleaned text string.
 */
function clean_text( string $text ): string {
    // Normalize Windows (\r\n) and classic Mac (\r) line breaks to "\n".
    // A literal "\n" is used instead of PHP_EOL: on Windows PHP_EOL is "\r\n" and, because
    // str_replace() applies its searches one after another, the "\r" pass would re-replace
    // the carriage returns that the "\r\n" pass has just inserted.
    $text = str_replace( [ "\r\n", "\r" ], "\n", $text );

    // Ordered clean-up steps as [ pattern, replacement ]; the order matters:
    // trailing spaces must be gone before empty lines can be detected and removed.
    $steps = [
        [ '/ {2,}/', ' ' ],    // Replace two or more spaces with a single space.
        [ '/^[ \t]+/m', '' ],  // Remove leading spaces and tabs on every line.
        [ '/[ \t]+$/m', '' ],  // Remove trailing spaces and tabs on every line.
        [ '/^\s+/m', '' ],     // Remove empty lines (whitespace runs at a line start).
    ];

    foreach ( $steps as $step ) {
        $cleaned = preg_replace( $step[0], $step[1], $text );
        // preg_replace() returns null on a PCRE error; keep the text from the previous
        // step in that case instead of violating the declared string return type.
        if ( null !== $cleaned ) {
            $text = $cleaned;
        }
    }

    return $text;
}
/**
 * Joins text parts into an array of strings, packing as many consecutive parts as fit into each string.
 *
 * Parts are trimmed and empty parts are skipped. Consecutive parts are joined with $glue for as
 * long as the joined string stays within $limit; the length is measured with strlen(), i.e. in bytes.
 * A single part that is longer than $limit on its own is not cut: it is returned as its own,
 * oversized string. The result never contains empty strings, so empty input yields an empty array.
 *
 * @param array  $parts An array of text parts to be joined.
 * @param int    $limit The maximum length of each resulting string.
 * @param string $glue  The separator used to join text parts (default: PHP_EOL).
 *
 * @return array An array of non-empty strings built from the given parts.
 */
function join_text_chops( array $parts = [], int $limit = 1650, string $glue = PHP_EOL ): array {
    $results     = [];
    $current     = '';
    $glue_length = strlen( $glue );

    foreach ( $parts as $part ) {
        // Remove leading and trailing whitespace from the part.
        $part = trim( (string) $part );

        // An empty part would only contribute a dangling glue, so skip it.
        if ( '' === $part ) {
            continue;
        }

        // Nothing accumulated yet: the part starts a new string and no glue is involved.
        // The strict comparison matters: empty( '0' ) is true, which would drop the glue after a "0".
        if ( '' === $current ) {
            $current = $part;
            continue;
        }

        // Append the part if the joined string (including the glue) still fits into the limit.
        if ( strlen( $current ) + $glue_length + strlen( $part ) <= $limit ) {
            $current .= $glue . $part;
            continue;
        }

        // The part does not fit: flush the accumulated string and start a new one with the part.
        $results[] = $current;
        $current   = $part;
    }

    // Flush the last accumulated string, if any.
    if ( '' !== $current ) {
        $results[] = $current;
    }

    return $results;
}
/**
 * Chops text into an array of strings, aiming at a maximum length of $limit for each string.
 *
 * The text is first cleaned with clean_text(). If the cleaned text already fits into $limit
 * (measured with strlen(), i.e. in bytes) it is returned as a single-element array. Otherwise
 * it is split at sentence separators and the pieces are re-packed with join_text_chops().
 * Pieces that contain no split point are not cut any further.
 *
 * @param string $text  The raw document text to be chopped.
 * @param int    $limit The maximum length of each resulting string.
 *
 * @return array An array of strings built from the cleaned text.
 */
function chop_raw_text( string $text, int $limit = 1650 ): array {
    // Clean the text before chopping.
    $text = clean_text( $text );

    // Return the text in an array if it is not longer than the limit.
    if ( strlen( $text ) <= $limit ) {
        return [ $text ];
    }

    // Make sure the text does not start with any of the separators,
    // and that a space is present at the end (the patterns below rely on it).
    $text = ltrim( $text, '.!,;: ' ) . ' ';
    // Make sure … (ellipsis) is replaced with ... (3 dots).
    $text = str_replace( '…', '...', $text );
    // Replace a sequence of 4 or more dots/underscores with 3 dots.
    // preg_replace() returns null on a PCRE error; keep the text unchanged in that case.
    $text = preg_replace( '/\.{4,}|_{4,}/', '...', $text ) ?? $text;

    // Whitespace at the end of a line that follows a sentence separator
    // which itself is not preceded by a digit or a dot.
    // NOTE(review): clean_text() strips trailing whitespace and empty lines, so this pattern
    // appears to be able to match only at the very end of the text — confirm whether a split
    // at every line break after a separator was intended.
    $line_break_pattern = '/(?<=(?<!\d|\.)[.!?;:])\s+$/m';
    // Spaces that follow a sentence ending which is not preceded by a digit or a dot.
    $sentence_pattern = '/(?<=(?<!\d|\.)[.!?]) +/';

    $results = [];

    if ( 0 < preg_match_all( $line_break_pattern, $text ) ) {
        // Split at line breaks after the sentence separators.
        // The limit argument is -1 ("no limit"): passing null to it is deprecated as of PHP 8.1.
        $parts = preg_split( $line_break_pattern, $text, -1, PREG_SPLIT_NO_EMPTY );
        if ( false === $parts ) {
            // The split failed: treat the whole text as a single part.
            $parts = [ $text ];
        }

        foreach ( $parts as $part ) {
            // Remove leading and trailing whitespace from the part.
            $part = trim( $part );
            if ( '' === $part ) {
                continue;
            }

            // A part that fits into the limit is added as is.
            if ( strlen( $part ) <= $limit ) {
                $results[] = $part;
                continue;
            }

            // The part exceeds the limit: split it on sentence endings...
            $items = preg_split( $sentence_pattern, $part, -1, PREG_SPLIT_NO_EMPTY );
            if ( false === $items ) {
                $items = [ $part ];
            }
            // ...and pack as many sentences as possible into each string, joined with spaces.
            $results = array_merge( $results, join_text_chops( $items, $limit, ' ' ) );
        }
    } else {
        // Split on sentence endings as the text does not have any obvious line breaks to split on.
        $results = preg_split( $sentence_pattern, $text, -1, PREG_SPLIT_NO_EMPTY );
        if ( false === $results ) {
            $results = [ $text ];
        }
    }

    // Re-pack the pieces (joined with PHP_EOL) to get each string as close to the limit as possible.
    return join_text_chops( $results, $limit );
}
// have fun
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Usage: