Skip to content

Instantly share code, notes, and snippets.

Created May 9, 2018 04:14
Show Gist options
  • Save nickrouty/6f5ed07e79d2223b279fc5e662264b10 to your computer and use it in GitHub Desktop.
Save nickrouty/6f5ed07e79d2223b279fc5e662264b10 to your computer and use it in GitHub Desktop.
Class for extracting text from doc, docx, xlsx and pptx files, plus a wrapper for a third-party PDF-to-text library.
/**
 * Class RD_Text_Extraction
 *
 * Extracts plain text from doc, docx, xlsx and pptx files, and wraps a
 * third-party library for PDF text extraction.
 *
 * Example usage:
 *
 *     $response = RD_Text_Extraction::convert_to_text($path_to_valid_file);
 *
 * For PDF text extraction, this class requires the Smalot\PdfParser\Parser class.
 *
 * @see https://github.com/smalot/pdfparser
 */
class RD_Text_Extraction
{
    /**
     * Extracts the text of a PDF through the Smalot PDF parser.
     *
     * @param string $path_to_file Path to the PDF file.
     * @return string Whatever the parsed document's getText() returns.
     * @throws \Exception When the Smalot\PdfParser\Parser class is not loaded.
     */
    protected static function pdf_to_text( $path_to_file )
    {
        if ( ! class_exists( '\\Smalot\\PdfParser\\Parser' ) ) {
            throw new \Exception('The library used to parse PDFs was not found.' );
        }

        $parser = new \Smalot\PdfParser\Parser();
        $pdf = $parser->parseFile( $path_to_file );

        return $pdf->getText();
    }

    /**
     * Extracts text from a legacy binary .doc file with a byte-level heuristic.
     *
     * The file is split on carriage returns; every chunk that is empty or
     * contains a NUL byte is discarded, the remaining chunks are joined with a
     * single space, and finally every character outside a small whitelist
     * (letters, digits, whitespace and a few punctuation marks) is removed.
     *
     * @param string $path_to_file Path to the .doc file.
     * @return mixed|string The extracted text; an empty string when the file
     *                      is unreadable or empty.
     */
    protected static function doc_to_text( $path_to_file )
    {
        // file_get_contents() opens, reads and closes in one call, so no file
        // handle is leaked and a zero-byte file is not passed to fread().
        $contents = is_readable($path_to_file) ? file_get_contents($path_to_file) : false;
        if ($contents === false || $contents === '') {
            return '';
        }

        $response = '';
        foreach (explode(chr(0x0D), $contents) as $current_line) {
            // Chunks containing NUL bytes are treated as binary structure, not text.
            if ($current_line === '' || strpos($current_line, chr(0x00)) !== false) {
                continue;
            }
            $response .= $current_line . ' ';
        }

        return preg_replace('/[^a-zA-Z0-9\s\,\.\-\n\r\t@\/\_\(\)]/', '', $response);
    }

    /**
     * Extracts text from a .docx file by reading word/document.xml.
     *
     * Table-cell boundaries become a space, paragraph ends become CRLF, and all
     * remaining markup is stripped.
     *
     * @param string $path_to_file Path to the .docx file.
     * @return bool|string The extracted text, an empty string when the archive
     *                     has no word/document.xml, or false when the file
     *                     cannot be opened as a zip archive.
     */
    protected static function docx_to_text( $path_to_file )
    {
        // ZipArchive replaces the deprecated procedural zip_open()/zip_read() API
        // and matches what xlsx_to_text() and pptx_to_text() already use.
        $zip = new ZipArchive();
        if ($zip->open($path_to_file) !== true) {
            return false;
        }

        $response = $zip->getFromName('word/document.xml');
        $zip->close();

        if ($response === false) {
            $response = '';
        }

        $response = str_replace('</w:r></w:p></w:tc><w:tc>', ' ', $response);
        $response = str_replace('</w:r></w:p>', "\r\n", $response);
        $response = strip_tags($response);

        return $response;
    }

    /**
     * Extracts text from an .xlsx file by reading xl/sharedStrings.xml.
     *
     * @param string $path_to_file Path to the .xlsx file.
     * @return string The tag-stripped shared strings; an empty string when the
     *                archive cannot be opened or has no shared-strings part.
     */
    protected static function xlsx_to_text( $path_to_file )
    {
        $xml_filename = 'xl/sharedStrings.xml'; //content file name
        $zip_handle = new ZipArchive();

        if (true !== $zip_handle->open($path_to_file)) {
            return '';
        }

        $xml_data = $zip_handle->getFromName($xml_filename);
        $zip_handle->close();

        return self::xml_to_text($xml_data);
    }

    /**
     * Extracts text from a .pptx file by reading ppt/slides/slide1.xml,
     * slide2.xml, ... until the next consecutive slide is missing.
     *
     * @param string $path_to_file Path to the .pptx file.
     * @return string The concatenated, tag-stripped slide contents; an empty
     *                string when the archive cannot be opened or has no slides.
     */
    protected static function pptx_to_text( $path_to_file )
    {
        $zip_handle = new ZipArchive();
        $response = '';

        if (true !== $zip_handle->open($path_to_file)) {
            return $response;
        }

        $slide_number = 1; //loop through slide files
        while (($xml_data = $zip_handle->getFromName('ppt/slides/slide' . $slide_number . '.xml')) !== false) {
            $response .= self::xml_to_text($xml_data);
            // Advance to the next slide; without this the loop never terminates.
            $slide_number++;
        }
        $zip_handle->close();

        return $response;
    }

    /**
     * Parses an XML string and returns its serialisation with all tags stripped.
     *
     * @param string|false $xml_data Raw XML, or false when the archive entry was missing.
     * @return string Tag-stripped text; an empty string when there is nothing to parse
     *                or the XML is not well-formed.
     */
    private static function xml_to_text( $xml_data )
    {
        if ($xml_data === false || $xml_data === '') {
            return '';
        }

        $doc = new DOMDocument();
        // LIBXML_NONET blocks network access while parsing; entity substitution
        // is deliberately left off because the input file is untrusted.
        if ( ! $doc->loadXML($xml_data, LIBXML_NONET | LIBXML_NOERROR | LIBXML_NOWARNING)) {
            return '';
        }

        return strip_tags((string) $doc->saveXML());
    }

    /**
     * Lists the file extensions convert_to_text() accepts.
     *
     * NOTE(review): the list below is reconstructed from the *_to_text methods
     * this class defines (each entry must have one, because convert_to_text()
     * dispatches by name) - confirm against the upstream source.
     *
     * @return array Lower-case extensions without the leading dot.
     */
    public static function get_valid_file_types()
    {
        return [
            'doc',
            'docx',
            'pptx',
            'xlsx',
            'pdf',
        ];
    }

    /**
     * Converts a supported file to plain text by dispatching on its extension
     * to the matching <extension>_to_text() method.
     *
     * @param string $path_to_file Path to an existing file.
     * @return bool|mixed|string The extracted text (false when a .docx cannot
     *                           be opened as a zip archive).
     * @throws \Exception When the file does not exist or its extension is not
     *                    in get_valid_file_types().
     */
    public static function convert_to_text( $path_to_file )
    {
        if ( ! isset($path_to_file) || ! file_exists($path_to_file)) {
            throw new \Exception('Invalid file provided. The file does not exist.');
        }

        // PATHINFO_EXTENSION yields '' for extension-less paths instead of an
        // undefined 'extension' array key.
        $file_ext = strtolower(pathinfo($path_to_file, PATHINFO_EXTENSION));

        if ( ! in_array($file_ext, self::get_valid_file_types(), true)) {
            throw new \Exception('Invalid file type provided. Valid file types are doc, docx, xlsx or pptx.');
        }

        $method = $file_ext . '_to_text';

        return self::$method( $path_to_file );
    }
}
Copy link

Thank you very much!

Copy link

thank you very much!

Copy link

MaheKarim commented Feb 12, 2025

Thank you, it's working. However, zip_open does not work anymore.

/**
 * Extracts text from a legacy binary .doc file with a byte-level heuristic.
 *
 * The file is split on carriage returns; chunks that are empty or contain a
 * NUL byte are skipped, the rest are appended to the result followed by a run
 * of spaces, and every character outside a small whitelist is then removed.
 *
 * @param string $path_to_file Path to the .doc file.
 * @return string The extracted text; only the initial padding when the file is
 *                unreadable or empty.
 */
protected static function doc_to_text($path_to_file)
{
    $response = '       ';

    // file_get_contents() opens, reads and closes in one call, so no file
    // handle is leaked and a zero-byte file is not passed to fread().
    $contents = is_readable($path_to_file) ? file_get_contents($path_to_file) : false;
    if ($contents === false || $contents === '') {
        return $response;
    }

    foreach (explode(chr(0x0D), $contents) as $current_line) {
        // Chunks containing NUL bytes are treated as binary structure, not text.
        if ($current_line === '' || strpos($current_line, chr(0x00)) !== false) {
            continue;
        }
        $response .= $current_line . '                   ';
    }

    $response = preg_replace('/[^a-zA-Z0-9\s\,\.\-\n\r\t@\/\_\(\)]/', '', $response);

    return $response;
}

    /**
     * Extracts text from a .docx file by reading word/document.xml.
     *
     * Table-cell boundaries become a space, paragraph ends become CRLF, and all
     * remaining markup is stripped.
     *
     * @param string $path_to_file Path to the .docx file.
     * @return bool|string The extracted text, an empty string when the archive
     *                     has no word/document.xml, or false when the file
     *                     cannot be opened as a zip archive.
     */
    protected static function docx_to_text($path_to_file)
    {
        $response = '';
        $zip = new ZipArchive();

        if ($zip->open($path_to_file) !== true) {
            return false;
        }

        // Read the document.xml file content
        $content = $zip->getFromName('word/document.xml');
        // Release the archive handle as soon as the entry has been read.
        $zip->close();

        if ($content !== false) {
            $response = $content;
        }

        // Clean up the XML content
        $response = str_replace('</w:r></w:p></w:tc><w:tc>', ' ', $response);
        $response = str_replace('</w:r></w:p>', "\r\n", $response);
        $response = strip_tags($response);

        return $response;
    }
Here is my code from a working codebase.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment