-
-
Save smalot/6183152 to your computer and use it in GitHub Desktop.
<?php
/**
 * @file
 * Class PdfParser
 *
 * @author : Sebastien MALOT <[email protected]>
 * @date : 2013-08-08
 *
 * References :
 * - http://www.mactech.com/articles/mactech/Vol.15/15.09/PDFIntro/index.html
 * - http://framework.zend.com/issues/secure/attachment/12512/Pdf.php
 * - http://www.php.net/manual/en/ref.pdf.php#74211
 */
/**
 * Minimal text extractor for PDF documents.
 *
 * The parser scans the raw bytes for "obj ... endobj" sections, inflates the
 * streams whose dictionary mentions FlateDecode, and interprets a subset of
 * the PDF text operators (Td, TD, Tf, TJ, Tj, T*) line by line. Streams with
 * any other filter are passed through undecoded, as in the original code.
 */
class PdfParser
{
    /**
     * Parse a PDF file.
     *
     * @param string $filename Path of the PDF file to read.
     * @return string|null The extracted text, or null when the file cannot be
     *                     read or no text could be extracted.
     */
    public static function parseFile($filename)
    {
        $content = file_get_contents($filename);
        if ($content === false) {
            // file_get_contents() already raised a warning; return the same
            // "nothing extracted" value instead of parsing a boolean.
            return null;
        }

        return self::extractText($content);
    }

    /**
     * Parse PDF content that is already in memory.
     *
     * @param string $content Raw bytes of the PDF document.
     * @return string|null The extracted text, or null when nothing was found.
     */
    public static function parseContent($content)
    {
        return self::extractText($content);
    }

    /**
     * Convert the raw bytes of a PDF document into text.
     *
     * @param string $data Raw bytes of the PDF document.
     * @return string|null The extracted text with runs of white space
     *                     collapsed to a single space, or null when no text
     *                     was extracted.
     */
    protected static function extractText($data)
    {
        // Split the document into objects and keep, for each object that has
        // both, its first dictionary ('filter') and its stream bytes ('data').
        $chunks = array();
        foreach (self::getDataArray($data, 'obj', 'endobj') as $obj) {
            $dictionaries = self::getDataArray($obj, '<<', '>>');
            $streams = self::getDataArray($obj, 'stream', 'endstream');
            if (!isset($dictionaries[0], $streams[0])) {
                continue;
            }
            $chunks[] = array(
                'filter' => $dictionaries[0],
                'data' => self::extractStreamBytes($streams[0]),
            );
        }

        $result_data = '';
        foreach ($chunks as $chunk) {
            if (strpos($chunk['filter'], 'FlateDecode') !== false) {
                $decoded = self::inflate($chunk['data']);
                if ($decoded === false) {
                    // Not a zlib stream we can read; skip it.
                    continue;
                }
            } else {
                // Unfiltered content is text, so trimming it is safe here.
                $decoded = trim($chunk['data']);
            }

            if (trim($decoded) !== '') {
                $result_data .= ' ' . self::extractTextElements($decoded);
            }
        }

        if (trim($result_data) === '') {
            return null;
        }

        // Re-join words hyphenated across a line break, then collapse white space.
        $result_data = preg_replace('/\s*-[\r\n]+\s*/', '', $result_data);
        $result_data = preg_replace('/\s+/', ' ', $result_data);

        return $result_data;
    }

    /**
     * Return the bytes located between the "stream" and "endstream" keywords.
     *
     * Only the single end-of-line marker that follows "stream" is removed.
     * The payload is deliberately not trimmed: it may be binary, and a
     * compressed stream can legitimately end with a byte that trim() would
     * treat as white space.
     *
     * @param string $section A section starting with "stream" and ending with "endstream".
     * @return string The stream payload.
     */
    private static function extractStreamBytes($section)
    {
        $length = strlen($section) - strlen('stream') - strlen('endstream');
        $bytes = (string) substr($section, strlen('stream'), $length);

        if (strncmp($bytes, "\r\n", 2) === 0) {
            return (string) substr($bytes, 2);
        }
        if (strncmp($bytes, "\n", 1) === 0) {
            return (string) substr($bytes, 1);
        }

        return $bytes;
    }

    /**
     * Inflate a FlateDecode stream.
     *
     * The end-of-line marker before "endstream" is optional in practice, so
     * several candidates are tried: the payload as-is, the payload without one
     * trailing end-of-line marker, and finally the fully trimmed payload (the
     * historical behaviour of this class).
     *
     * @param string $raw Stream payload.
     * @return string|false The inflated bytes, or false when every attempt failed.
     */
    private static function inflate($raw)
    {
        $candidates = array($raw);
        foreach (array("\r\n", "\n", "\r") as $eol) {
            if (substr($raw, -strlen($eol)) === $eol) {
                $candidates[] = (string) substr($raw, 0, -strlen($eol));
            }
        }
        $candidates[] = trim($raw);

        foreach ($candidates as $candidate) {
            if ($candidate === '') {
                continue;
            }
            // gzuncompress() reports bad data with a warning and a false
            // return value; the warning is suppressed because failure is an
            // expected outcome while probing candidates.
            $inflated = @gzuncompress($candidate);
            if ($inflated !== false) {
                return $inflated;
            }
        }

        return false;
    }

    /**
     * Interpret a decoded content stream and collect the text it displays.
     *
     * The stream is processed one line at a time; each line is expected to
     * hold operands followed by a single operator.
     *
     * @param string $content Decoded content stream.
     * @return string The text found in the stream.
     */
    protected static function extractTextElements($content)
    {
        // Streams starting with /CIDInit are skipped entirely.
        if (strpos($content, '/CIDInit') === 0) {
            return '';
        }

        $text = '';
        foreach (explode("\n", $content) as $line) {
            $line = trim($line);
            $command = $line;
            $operator = '';
            $matches = array();

            // Split the line into its operands ("command") and trailing operator.
            if (preg_match('/^(?<command>.*[\)\] ])(?<operator>[a-z]+[\*]?)$/i', $line, $matches)) {
                $command = self::decodeOperands(trim($matches['command']));
                $operator = trim($matches['operator']);
            }

            switch ($operator) {
                // Move text current point.
                case 'Td':
                    $operands = preg_split('/\s+/', $command, -1, PREG_SPLIT_NO_EMPTY);
                    // Explicit float casts: comparing a non-numeric string with
                    // an integer gives different results on PHP 7 and PHP 8.
                    $y = (float) array_pop($operands);
                    $x = (float) array_pop($operands);
                    if ($x > 0) {
                        $text .= ' ';
                    }
                    if ($y < 0) {
                        $text .= ' ';
                    }
                    break;

                // Move text current point and set leading.
                case 'TD':
                    $operands = preg_split('/\s+/', $command, -1, PREG_SPLIT_NO_EMPTY);
                    $y = (float) array_pop($operands);
                    if ($y < 0) {
                        $text .= "\n";
                    }
                    break;

                // Set font name and size.
                case 'Tf':
                    $text .= ' ';
                    break;

                // Display text, allowing individual character positioning.
                case 'TJ':
                    $text .= self::parseTextCommand(self::textBetween($command, '[', ']'));
                    break;

                // Display text.
                case 'Tj':
                    $text .= self::textBetween($command, '(', ')');
                    break;

                // Move to start of next line.
                case 'T*':
                    $text .= "\n";
                    break;

                // Every other operator (Tc, TL, Tm, Tr, Ts, Tw, Tz, g, gs, re,
                // f, BT, ET, ...) and lines without an operator add no text.
                default:
                    break;
            }
        }

        // Unescape round brackets that were escaped inside string operands.
        $text = str_replace(array('\\(', '\\)'), array('(', ')'), $text);

        return $text;
    }

    /**
     * Decode the escape sequences of an operand string and convert it to UTF-8.
     *
     * @param string $command Operands of one content-stream line.
     * @return string The decoded operands.
     */
    private static function decodeOperands($command)
    {
        // Three-digit octal escapes: drop control characters (below 32),
        // replace the others with the byte they denote.
        $command = preg_replace_callback(
            '/\\\\([0-7]{3})/',
            function ($match) {
                $code = octdec($match[1]);

                return $code < 32 ? '' : chr($code);
            },
            $command
        );

        // Removes encoded new lines, tabs, ...
        $command = preg_replace('/\\\\[\r\n]/', '', $command);
        $command = preg_replace('/\\\\[rnftb ]/', ' ', $command);

        // Force UTF-8 charset.
        $encoding = mb_detect_encoding($command, array('ASCII', 'UTF-8', 'Windows-1252', 'ISO-8859-1'));
        if ($encoding === false || strtoupper($encoding) !== 'UTF-8') {
            // iconv() signals unconvertible input with a notice and a false
            // return value; in that case the operands are kept unchanged.
            $converted = @iconv('CP1252', 'UTF-8//TRANSLIT//IGNORE', $command);
            if ($converted !== false && $converted !== '') {
                $command = $converted;
            }
        }

        return $command;
    }

    /**
     * Return the text between the first $open and the last $close delimiter.
     *
     * @param string $command Operands of one content-stream line.
     * @param string $open    Opening delimiter.
     * @param string $close   Closing delimiter.
     * @return string The enclosed text, or an empty string when the delimiters
     *                are missing or out of order.
     */
    private static function textBetween($command, $open, $close)
    {
        $first = mb_strpos($command, $open, 0, 'UTF-8');
        $last = mb_strrpos($command, $close, 0, 'UTF-8');
        if ($first === false || $last === false || $last <= $first) {
            return '';
        }

        return mb_substr($command, $first + 1, $last - $first - 1, 'UTF-8');
    }

    /**
     * Strip out the text from the operand array of a TJ operator.
     *
     * The array alternates "(string)" elements with numeric adjustments. A
     * space is inserted before a string when the preceding adjustment is below
     * -50 or when more than 8 characters separate it from the previous string.
     *
     * @param string $text      Content of the TJ array, without the square brackets.
     * @param int    $font_size Currently not used.
     *
     * @return string The concatenated strings.
     */
    protected static function parseTextCommand($text, $font_size = 0)
    {
        $result = '';
        $cursor = 0;

        while (($open = mb_strpos($text, '(', $cursor, 'UTF-8')) !== false) {
            $gap = $open - $cursor;
            if ($gap > 8) {
                $spacing = ' ';
            } else {
                // Cast explicitly: on PHP 8 an empty string compared with an
                // integer is compared as a string and would count as "< -50".
                $adjustment = (float) mb_substr($text, $cursor, $gap, 'UTF-8');
                $spacing = $adjustment < -50 ? ' ' : '';
            }

            // Find the closing bracket, skipping brackets preceded by a backslash.
            $body_start = $open + 1;
            $search_from = $body_start;
            while (($close = mb_strpos($text, ')', $search_from, 'UTF-8')) !== false) {
                if (mb_substr($text, $close - 1, 1, 'UTF-8') != '\\') {
                    break;
                }
                $search_from = $close + 1;
            }

            // Unterminated string: stop with what has been collected so far.
            if ($close === false) {
                break;
            }

            $result .= $spacing . mb_substr($text, $body_start, $close - $body_start, 'UTF-8');
            $cursor = $close + 1;
        }

        return $result;
    }

    /**
     * Convert a section of data into an array, separated by the start and end words.
     *
     * Each returned element begins with the start word and ends with the end
     * word. Scanning resumes after the end word of the previous element.
     *
     * @param string $data       The data.
     * @param string $start_word The start of each section of data.
     * @param string $end_word   The end of each section of data.
     * @return array The array of data.
     */
    protected static function getDataArray($data, $start_word, $end_word)
    {
        $a_results = array();
        $data = (string) $data;
        if ($start_word === '' || $end_word === '') {
            // An empty delimiter can never advance the scan.
            return $a_results;
        }

        $end_length = strlen($end_word);
        $offset = 0;
        while (($start = strpos($data, $start_word, $offset)) !== false) {
            $end = strpos($data, $end_word, $start);
            if ($end === false) {
                break;
            }
            // data is between start and end
            $a_results[] = substr($data, $start, $end - $start + $end_length);
            $offset = $end + $end_length;
        }

        return $a_results;
    }
}
does not work at all
Hello, this class works fine for some PDFs. But I want to extract text from my LinkedIn resume PDF, and in that case it is not working as expected.
Could anyone please help me regarding the same?
Hello, I need help on how to decode a PDF file to text in PHP. I have tried the above-mentioned libraries but still get this:
+DQGHOVUHJLVWHU�%�GHV $PWVJHULFKWV�)UDQNIXUW�DP 0DLQ $EWHLOXQJ�% :LHGHUJDEH�GHV�DNWXHOOHQ 5HJLVWHULQKDOWV 1XPPHU�GHU�)LUPD� 6HLWH���YRQ�� +DQGHOVUHJLVWHU�%�GHV $PWVJHULFKWV�)UDQNIXUW�DP 0DLQ $EWHLOXQJ�% :LHGHUJDEH�GHV�DNWXHOOHQ 5HJLVWHULQKDOWV 1XPPHU�GHU�)LUPD� 6HLWH���YRQ�� +DQGHOVUHJLVWHU�%�GHV $PWVJHULFKWV�)UDQNIXUW�DP 0DLQ $EWHLOXQJ�% :LHGHUJDEH�GHV�DNWXHOOHQ 5HJLVWHULQKDOWV 1XPPHU�GHU�)LUPD� 6HLWH���YRQ�� +DQGHOVUHJLVWHU�%�GHV $PWVJHULFKWV�)UDQNIXUW�DP 0DLQ $EWHLOXQJ�% :LHGHUJDEH�GHV�DNWXHOOHQ 5HJLVWHULQKDOWV 1XPPHU�GHU�)LUPD� 6HLWH���YRQ��
I am also facing the same issue with some PDF files that contain images.
does not work with Cyrillic