Skip to content

Instantly share code, notes, and snippets.

@sergeliatko
Last active June 13, 2023 23:45
Show Gist options
  • Save sergeliatko/c396a18be27559dcc02c40bd7d954b2d to your computer and use it in GitHub Desktop.
Save sergeliatko/c396a18be27559dcc02c40bd7d954b2d to your computer and use it in GitHub Desktop.
PHP code to preformat raw text and cut it into chunks before sending it to the OpenAI API
<?php
/**
 * Cleans a text string: unifies line breaks, collapses runs of spaces, strips
 * leading/trailing spaces and tabs on every line, and removes empty lines.
 *
 * The result uses "\n" as its only line separator on every platform.
 *
 * @param string $text The text string to be cleaned.
 *
 * @return string The cleaned text string.
 */
function clean_text( string $text ): string {
    // Use a literal "\n" rather than PHP_EOL: on Windows PHP_EOL is "\r\n", so replacing "\r"
    // with it would re-match the freshly inserted text and leave carriage returns behind.
    // Doubling the break is unnecessary because empty lines are removed below anyway.
    $text = str_replace( [ "\r\n", "\r" ], "\n", $text );
    // Applied strictly in this order; each step relies on the previous one.
    $steps = [
        // Replace two or more spaces with a single space
        '/ {2,}/'    => ' ',
        // Remove leading spaces and tabs on every line
        '/^[ \t]+/m' => '',
        // Remove trailing spaces and tabs on every line
        '/[ \t]+$/m' => '',
        // Remove empty lines (and any whitespace left at the start of a line)
        '/^\s+/m'    => '',
    ];
    foreach ( $steps as $pattern => $replacement ) {
        // preg_replace() returns null on a PCRE error; keep the previous value in that case
        // instead of violating the string return type.
        $text = preg_replace( $pattern, $replacement, $text ) ?? $text;
    }
    return $text;
}
/**
 * Greedily packs text parts into chunks no longer than $limit.
 *
 * Parts are trimmed and appended to the current chunk, separated by $glue, for as long as the
 * chunk stays within $limit. Length is measured with strlen(), i.e. in bytes. Parts that are
 * empty after trimming are skipped. A single part that is longer than $limit on its own is NOT
 * cut: it is returned as a chunk of its own and will exceed $limit.
 *
 * @param array  $parts An array of text parts to be joined.
 * @param int    $limit The maximum length (in bytes) of each resulting string.
 * @param string $glue  The separator used to join text parts (default: PHP_EOL).
 *
 * @return array A list of non-empty strings; empty when $parts holds no non-blank part.
 */
function join_text_chops( array $parts = [], int $limit = 1650, string $glue = PHP_EOL ): array {
    // Finished chunks
    $results = [];
    // Chunk currently being filled
    $current = '';
    foreach ( $parts as $part ) {
        // Remove leading and trailing whitespace from the part
        $part = trim( $part );
        // Blank parts would only add stray glue
        if ( '' === $part ) {
            continue;
        }
        // Compare against '' explicitly: empty() treats the string "0" as empty, which would
        // drop the glue after a chunk consisting of "0". The glue is only counted when it is
        // actually going to be inserted.
        $candidate = ( '' === $current ) ? $part : $current . $glue . $part;
        if ( strlen( $candidate ) <= $limit ) {
            $current = $candidate;
            continue;
        }
        // The part does not fit: close the current chunk (never emit an empty one) ...
        if ( '' !== $current ) {
            $results[] = $current;
        }
        // ... and start a new chunk with the part
        $current = $part;
    }
    // Flush the last chunk
    if ( '' !== $current ) {
        $results[] = $current;
    }
    return $results;
}
/**
 * Chops raw text into chunks of at most $limit bytes, preferring paragraph and sentence boundaries.
 *
 * The text is cleaned with clean_text() first. If it already fits into $limit it is returned as
 * a single-element array. Otherwise it is split at line breaks that follow a sentence separator
 * (. ! ? ; :); any piece still longer than $limit is split again at sentence endings (. ! ?
 * followed by spaces). The pieces are finally packed back together with join_text_chops().
 *
 * Length is measured with strlen(), i.e. in bytes. A single sentence longer than $limit is not
 * cut, so such a chunk can exceed $limit.
 *
 * @param string $text  The raw document text to be chopped.
 * @param int    $limit The maximum length (in bytes) of each resulting string.
 *
 * @return array An array of strings built from the text, in their original order.
 */
function chop_raw_text( string $text, int $limit = 1650 ): array {
    // Clean the text before chopping
    $text = clean_text( $text );
    // Return the text in an array if it is not longer than the limit
    if ( strlen( $text ) <= $limit ) {
        return [ $text ];
    }
    // Sentence ending: . ! ? (not preceded by a digit or a dot) followed by one or more spaces
    $sentence_end = '/(?<=(?<!\d|\.)[.!?]) +/';
    // Paragraph ending: a sentence separator (not preceded by a digit or a dot) followed by
    //  - a line break, or
    //  - whitespace that runs to the end of a line / the end of the text.
    // The first alternative is required: with the m flag "$" only asserts the position BEFORE a
    // newline, so "\s+$" alone can never consume the newline that directly follows a separator
    // and would only ever match at the very end of the text.
    $paragraph_end = '/(?<=(?<!\d|\.)[.!?;:])(?:[ \t]*[\r\n]\s*|\s+$)/m';
    // Make sure the text does not start with any of the separators, and a space is appended
    // (it lets the paragraph pattern match at the end of a text that ends with a separator)
    $text = ltrim( $text, '.!,;: ' ) . ' ';
    // Make sure … (ellipsis) is replaced with ... (3 dots)
    $text = str_replace( '…', '...', $text );
    // Make sure a sequence of 4 or more dots/underscores is replaced with 3 dots
    $text = preg_replace( '/\.{4,}|_{4,}/', '...', $text ) ?? $text;
    // Pieces to be packed into chunks at the end
    $results = [];
    if ( 1 === preg_match( $paragraph_end, $text ) ) {
        // Use -1 ("no limit") explicitly: passing null to the int $limit parameter of
        // preg_split() is deprecated as of PHP 8.1
        $paragraphs = preg_split( $paragraph_end, $text, -1, PREG_SPLIT_NO_EMPTY );
        if ( false === $paragraphs ) {
            // PCRE failure: fall back to treating the whole text as one paragraph
            $paragraphs = [ $text ];
        }
        foreach ( $paragraphs as $paragraph ) {
            // Remove leading and trailing whitespace from the paragraph
            $paragraph = trim( $paragraph );
            if ( strlen( $paragraph ) <= $limit ) {
                // Short enough: keep the paragraph as is
                $results[] = $paragraph;
                continue;
            }
            // Too long: split on sentence endings ...
            $sentences = preg_split( $sentence_end, $paragraph, -1, PREG_SPLIT_NO_EMPTY );
            if ( false === $sentences ) {
                $sentences = [ $paragraph ];
            }
            // ... and stuff as many sentences as possible into each piece, joining with spaces
            $results = array_merge( $results, join_text_chops( $sentences, $limit, ' ' ) );
        }
    } else {
        // No paragraph boundary found: split the whole text on sentence endings
        $sentences = preg_split( $sentence_end, $text, -1, PREG_SPLIT_NO_EMPTY );
        $results   = ( false === $sentences ) ? [ $text ] : $sentences;
    }
    // Pack the pieces into chunks, separated by line breaks
    return join_text_chops( $results, $limit );
}
// have fun
@sergeliatko
Copy link
Author

Usage:

<?php

// Raw document text to split (placeholder). The string must be opened and closed with the
// same quote character; mixing ' and " is a parse error.
$text = 'Your raw text goes here..... really long it be.';
// get pre-formatted text chunks of 2000 characters (or less) long
/** @var string[] $pieces */
$pieces = chop_raw_text( $text, 2000 );
// get your requests container
$requests = [];
// loop through pieces and send them to your model
// (your_awesome_request_builder() and your_awesome_client() are placeholders for your own code)
foreach($pieces as $index => $piece ) {
    $requests[$index] = your_awesome_request_builder($piece);
}
// get API responses
$responses = your_awesome_client($requests);

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment