Created
August 8, 2013 10:15
-
-
Save smalot/6183449 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php
/**
 * Author : Zikko
 * Source : http://www.zikko.se/resources/extractFromPDF.php
 *
 * Usage :
 *
 * $path = 'my_pdf_file.pdf';
 * $pdf = new pdf( $path );
 * $pages = $pdf->get_pages();
 *
 * foreach ($pages as $nr => $page) {
 *     list($width,$height) = $page->get_dimensions();
 *     $text = $page->get_text();
 *     echo "Page $nr is $width x $height and the text is:\n$text\n\n";
 * }
 *
 */
/*
 * Character-class fragments that the parser splices into PCRE bracket
 * expressions ('/[' . X . ']*' and '/[^' . X . ']*').  They are single-quoted
 * on purpose: the backslash sequences are interpreted by PCRE, not by PHP.
 *
 * PDF white space is NUL, TAB, LF, FF, CR and SPACE; the previous definitions
 * only knew SPACE, LF and CR, so a tab between two tokens was read as part of
 * the following token.
 */
define('PDF_PATTERN_SEPARATOR', ' \t\n\r\f\x00\/\<\>()\[\]');
define('PDF_PATTERN_WHITESPACE', ' \t\n\r\f\x00');
/**
 * Cursor-based tokenizer/parser over an in-memory chunk of PDF syntax.
 *
 * The object keeps a byte offset into $data; every read_* / skip_* / expect
 * call advances that offset.  read_object() is the entry point that turns
 * the bytes at the cursor into a PHP value:
 *   - numbers            -> float
 *   - literal/hex string -> string
 *   - /Name              -> string (without the slash)
 *   - [ ... ]            -> array (list)
 *   - << ... >>          -> array (map), or pdf_stream when followed by "stream"
 *   - true/false/null    -> bool / null
 *   - "N 0 R"            -> pdf_reference   (only when $allow_references is set)
 *   - anything else      -> pdf_operator
 */
class pdf_readstream
{
    /** Bytes being parsed (a trimmed copy of what the constructor received). */
    public $data;
    /** Current read position, as a byte offset into $data. */
    public $offset;
    /** Byte length of $data, captured at construction time. */
    public $size;
    /** When truthy, "N 0 R" and "N 0 obj" are parsed as references / indirect objects. */
    public $allow_references;

    /**
     * PHP 5+/8 constructor.  Delegates to the legacy-named initialiser so that
     * subclasses calling parent::pdf_readstream() keep working.
     *
     * @param string $data   Bytes to parse.
     * @param int    $offset Initial cursor position.
     */
    public function __construct($data = '', $offset = 0)
    {
        $this->pdf_readstream($data, $offset);
    }

    /**
     * Legacy (PHP 4 style) initialiser; PHP 8 no longer treats it as a
     * constructor, hence __construct() above.  The argument used to be taken by
     * reference although it was never written to; it is now taken by value so
     * expressions can be passed as well.
     *
     * @param string $data   Bytes to parse; leading/trailing whitespace is trimmed.
     * @param int    $offset Initial cursor position.
     */
    public function pdf_readstream($data, $offset = 0)
    {
        $this->data = trim((string) $data);
        $this->offset = $offset;
        $this->size = strlen($this->data);
    }

    /**
     * Parse and return the next object at the cursor.
     *
     * @return mixed See the class comment for the value produced per token kind.
     */
    public function read_object()
    {
        $this->skip_whitespace();

        // A comment runs from '%' to the end of the line and carries no value,
        // so skip it instead of handing its first word back as data.
        while ($this->get_next() === '%') {
            $this->read_until('\n\r');
            $this->skip_whitespace();
        }

        $first = $this->get_next();

        if ($first !== '' && strpos('0123456789.-+', $first) !== false) {
            return $this->read_number_or_indirect();
        }

        switch ($first) {
            case '(':
                return $this->read_literal_string();

            case '/':
                // Name: everything up to the next separator, slash not included.
                $this->offset++;
                return $this->read_until(PDF_PATTERN_SEPARATOR);

            case '[':
                return $this->read_array();

            case '<':
                return $this->read_dictionary_stream_or_hex();

            case 'f':
                if ($this->expect('false')) {
                    return false;
                }
                break;

            case 'n':
                if ($this->expect('null')) {
                    return null;
                }
                break;

            case 's':
                if ($this->expect('startxref')) {
                    $this->skip_whitespace();
                    $value = $this->read_while('0123456789');
                    // NOTE(review): pdf_startxref is not defined in the visible
                    // part of this file - confirm it exists elsewhere.
                    return $this->mark(new pdf_startxref($value));
                }
                break;

            case 't':
                if ($this->expect('true')) {
                    return true;
                }
                if ($this->expect('trailer')) {
                    // The parsed trailer used to be assigned and then thrown
                    // away by a 'break'; return it instead.
                    // NOTE(review): pdf_trailer is not defined in the visible
                    // part of this file - confirm it exists elsewhere.
                    return $this->mark(new pdf_trailer($this->read_object()));
                }
                break;
        }

        // Anything else is treated as an operator keyword.
        $token = $this->read_until(PDF_PATTERN_SEPARATOR);
        if ($token === '' && !$this->eof()) {
            // A stray delimiter such as ')' or '>' sits at the cursor.  Consume
            // it so that callers looping on read_object() always make progress.
            $token = $this->read();
        }
        return new pdf_operator($token);
    }

    /**
     * Parse a number; with $allow_references also "N 0 R" and "N 0 obj ... endobj".
     * Only generation 0 is recognised, as before.
     *
     * @return mixed float, pdf_reference, or the value wrapped by the indirect object.
     */
    protected function read_number_or_indirect()
    {
        // '\-' keeps the minus literal; the former class "+-." was a range
        // that also swallowed ','.
        $number = $this->read_while('0-9+\-.');

        if ($this->allow_references) {
            if ($this->expect(' 0 R')) {
                return $this->mark(new pdf_reference($number . ' 0'));
            }
            if ($this->expect(' 0 obj')) {
                $value = $this->read_object();
                $this->skip_whitespace();
                // Step over the terminator only if it is really there.
                $this->expect('endobj');
                return is_object($value) ? $this->mark($value) : $value;
            }
        }

        return (float) $number;
    }

    /**
     * Parse a literal string "( ... )" starting at the cursor.
     * Balanced parentheses are part of the string; backslash escapes are decoded.
     * An unterminated string ends at the end of the data.
     *
     * @return string
     */
    protected function read_literal_string()
    {
        $this->offset++; // opening '('
        $value = '';
        $depth = 1;

        while (!$this->eof()) {
            $char = $this->read();
            if ($char === '(') {
                $depth++;
                $value .= $char;
            } elseif ($char === ')') {
                $depth--;
                if ($depth === 0) {
                    break;
                }
                $value .= $char;
            } elseif ($char === '\\') {
                $value .= $this->read_string_escape();
            } else {
                $value .= $char;
            }
        }

        return $value;
    }

    /**
     * Decode the escape sequence following a backslash inside a literal string.
     *
     * @return string The decoded bytes (possibly empty for a line continuation).
     */
    protected function read_string_escape()
    {
        $char = $this->read();

        switch ($char) {
            case 'n':
                return "\n";
            case 'r':
                return "\r";
            case 't':
                return "\t";
            case 'b':
                // PHP has no "\b" escape; spell the backspace byte explicitly.
                return "\x08";
            case 'f':
                return "\x0C";
            case '(':
            case ')':
            case '\\':
                return $char;
            case "\r":
                // Backslash + end-of-line continues the string on the next line.
                $this->expect("\n");
                return '';
            case "\n":
                return '';
        }

        if ($char !== '' && strpos('01234567', $char) !== false) {
            // Octal character code: one to three digits.
            $digits = $char;
            while (strlen($digits) < 3) {
                $next = $this->get_next();
                if ($next === '' || strpos('01234567', $next) === false) {
                    break;
                }
                $digits .= $next;
                $this->offset++;
            }
            return chr(octdec($digits) & 0xFF);
        }

        // Unknown escape: the backslash is dropped, the character is kept.
        return $char;
    }

    /**
     * Parse an array "[ ... ]" starting at the cursor.
     *
     * @return array List of parsed elements.
     */
    protected function read_array()
    {
        $this->offset++; // opening '['
        $items = array();

        while (true) {
            $this->skip_whitespace();
            // eof() guards against an unterminated array.
            if ($this->eof() || $this->get_next() == ']') {
                break;
            }
            $items[] = $this->read_object();
        }

        $this->offset++; // closing ']'
        return $items;
    }

    /**
     * Handle everything that starts with '<': dictionary, stream or hex string.
     *
     * @return mixed array (dictionary), pdf_stream, or string (hex string).
     */
    protected function read_dictionary_stream_or_hex()
    {
        $value = $this->read_dictionary();

        if ($value === false) {
            // Hex string: "<48 65 6C>"; white space between digits is ignored.
            $this->offset++;
            $hex = preg_replace('/[^0-9A-Fa-f]/', '', $this->read_until('>'));
            $this->offset++;
            return pack('H*', $hex);
        }

        $this->skip_whitespace();
        if (!$this->expect('stream')) {
            return $value;
        }

        $data_length = isset($value['Length']) ? $value['Length'] : 0;
        if (is_object($data_length)) {
            $data_length = $data_length->resolve();
        }

        // Consume exactly one end-of-line after the keyword.  Skipping all
        // white space here could eat leading bytes of the stream payload.
        if (!$this->expect("\r\n") && !$this->expect("\n")) {
            $this->expect("\r");
        }

        $data = $this->read((int) $data_length);
        $this->skip_whitespace();
        $this->expect('endstream');

        return $this->mark(new pdf_stream($value, $data));
    }

    /**
     * Parse a dictionary "<< key value ... >>" at the cursor.
     *
     * @return array|false Map of key => value, or false when the cursor is not on "<<".
     */
    public function read_dictionary()
    {
        if (!$this->expect('<<')) {
            return false;
        }

        $value = array();
        while (true) {
            $this->skip_whitespace();
            // eof() guards against an unterminated dictionary.
            if ($this->expect('>>') || $this->eof()) {
                break;
            }
            // Read key and value in two explicit steps so the order does not
            // depend on how PHP evaluates "$a[f()] = g()".
            $key = $this->read_object();
            $item = $this->read_object();

            if (is_float($key)) {
                $key = (int) $key;
            }
            if (is_string($key) || is_int($key)) {
                $value[$key] = $item;
            }
            // Keys that are arrays/objects cannot index a PHP array; drop the pair.
        }
        return $value;
    }

    /**
     * Record this stream as the owner of $child (used by references to resolve themselves).
     *
     * @param object $child
     * @return object The same $child.
     */
    public function mark($child)
    {
        $child->parent = $this;
        return $child;
    }

    /** Advance the cursor by $count bytes. */
    public function skip($count = 1)
    {
        $this->offset += $count;
    }

    /**
     * Return the next $count bytes and advance the cursor by $count.
     *
     * @return string Fewer than $count bytes (possibly '') near the end of the data.
     */
    public function read($count = 1)
    {
        // The cast normalises the pre-PHP-8 "false" result at end of data.
        $bytes = (string) substr($this->data, $this->offset, $count);
        $this->offset += $count;
        return $bytes;
    }

    /**
     * If the bytes at the cursor equal $str, consume them.
     *
     * @return bool Whether $str was found (and consumed).
     */
    public function expect($str)
    {
        $length = strlen($str);
        if (substr($this->data, $this->offset, $length) == $str) {
            $this->offset += $length;
            return true;
        }
        return false;
    }

    /**
     * Peek at the next $count bytes without moving the cursor.
     *
     * @return string
     */
    public function get_next($count = 1)
    {
        return (string) substr($this->data, $this->offset, $count);
    }

    /** Advance the cursor past any run of white space. */
    public function skip_whitespace()
    {
        $this->offset += strlen($this->match_at('/[' . PDF_PATTERN_WHITESPACE . ']*/'));
    }

    /** Advance the cursor up to (not past) the first byte that is in $chars. */
    public function skip_until($chars)
    {
        $this->offset += strlen($this->match_at('/[^' . $chars . ']*/'));
    }

    /** Advance the cursor past the run of bytes that are in $chars. */
    public function skip_while($chars)
    {
        $this->offset += strlen($this->match_at('/[' . $chars . ']*/'));
    }

    /**
     * Consume and return bytes up to (not including) the first byte in $chars.
     *
     * @param string $chars PCRE character-class body (already escaped for PCRE).
     * @return string
     */
    public function read_until($chars)
    {
        $run = $this->match_at('/[^' . $chars . ']*/');
        $this->offset += strlen($run);
        return $run;
    }

    /**
     * Consume and return the run of bytes that are in $chars.
     *
     * @param string $chars PCRE character-class body (already escaped for PCRE).
     * @return string
     */
    public function read_while($chars)
    {
        $run = $this->match_at('/[' . $chars . ']*/');
        $this->offset += strlen($run);
        return $run;
    }

    /** Move the cursor to an absolute byte offset. */
    public function jump($offset)
    {
        $this->offset = $offset;
    }

    /** @return bool True once the cursor is at or beyond the end of the data. */
    public function eof()
    {
        return $this->offset >= strlen($this->data);
    }

    /**
     * Run $pattern against the data starting at the cursor.
     *
     * @param string $pattern Complete PCRE pattern.
     * @return string The matched text, or '' when nothing matched or the cursor
     *                is outside the data.
     */
    protected function match_at($pattern)
    {
        if ($this->offset < 0 || $this->offset > strlen($this->data)) {
            return '';
        }
        if (preg_match($pattern, $this->data, $matches, 0, (int) $this->offset) !== 1) {
            return '';
        }
        return $matches[0];
    }
}
/**
 * A PDF document loaded from disk.
 *
 * On construction the file is read, the classic cross-reference table(s) are
 * located via the last "startxref" and parsed, and the document catalog
 * (trailer /Root) is resolved.  Files that use cross-reference streams
 * instead of an "xref" table are not understood; for those the catalog stays
 * unset and get_pages() returns an empty list.
 */
class pdf extends pdf_readstream
{
    /** Resolved document catalog (array), or null when it could not be found. */
    public $catalog;
    /** Map "objnum generation" => byte offset (int), or null for free entries. */
    public $xref_table = array();
    /** Initialised to an empty array; not used by the code visible in this class. */
    public $objects_at_offsets = array();
    /** Offsets of xref sections already parsed; guards against cyclic /Prev chains. */
    protected $visited_xref_offsets = array();

    /**
     * @param string $filename Path of the PDF file to load.
     */
    public function __construct($filename)
    {
        $this->pdf($filename);
    }

    /**
     * Legacy (PHP 4 style) initialiser, kept for callers that invoke it by name.
     *
     * @param string $filename Path of the PDF file to load.
     */
    public function pdf($filename)
    {
        // Keep the contents in a variable: the parent initialiser may take its
        // argument by reference.
        $contents = file_get_contents($filename);
        parent::pdf_readstream($contents);

        $this->xref_table = array();
        $this->objects_at_offsets = array();
        $this->visited_xref_offsets = array();
        $this->allow_references = true;

        $marker = strrpos($this->data, 'startxref');
        if ($marker === false) {
            // Unreadable file or no cross-reference information at all.
            return;
        }

        $this->jump($marker);
        $this->expect('startxref');
        $offset = $this->read_object();
        if (!is_numeric($offset)) {
            return;
        }

        $this->parse_xref($offset);

        if (is_object($this->catalog)) {
            $this->catalog = $this->catalog->resolve();
        }
    }

    /**
     * Parse the cross-reference section at $offset, then follow its /Prev link.
     *
     * Sections are visited newest first, so an entry or /Root that is already
     * known is never overwritten by an older section.
     *
     * @param int|float $offset Byte offset of the "xref" keyword.
     */
    public function parse_xref($offset)
    {
        $offset = (int) $offset;
        if (isset($this->visited_xref_offsets[$offset])) {
            return;
        }
        $this->visited_xref_offsets[$offset] = true;

        $this->jump($offset);
        if (!$this->expect('xref')) {
            // Not a classic table (e.g. a cross-reference stream): give up
            // rather than scanning the rest of the file.
            return;
        }

        while (true) {
            $this->skip_whitespace();
            if ($this->expect('trailer')) {
                break;
            }

            // Subsection header: "<first object number> <entry count>".
            $start = $this->read_while('0123456789');
            $this->skip_whitespace();
            $count = $this->read_while('0123456789');
            if ($start === '' || $count === '') {
                // Neither "trailer" nor a subsection header: malformed/truncated.
                return;
            }
            $start = (int) $start;
            $count = (int) $count;

            for ($n = 0; $n < $count; $n++) {
                $this->skip_whitespace();
                // Entry: "<offset> <generation> <n|f>".
                $line = $this->read_while('0123456789 fn');
                if ($line === '') {
                    return;
                }
                $fields = preg_split('/ +/', trim($line));
                if (count($fields) < 3) {
                    continue;
                }

                $key = ($start + $n) . ' ' . (int) $fields[1];
                if (array_key_exists($key, $this->xref_table)) {
                    // A newer section already described this object.
                    continue;
                }
                // For free ('f') entries the first field is not a file offset.
                $this->xref_table[$key] = ($fields[2] === 'n') ? (int) $fields[0] : null;
            }
        }

        $this->skip_whitespace();
        $trailer = $this->read_dictionary();
        if (!is_array($trailer)) {
            return;
        }

        if (!isset($this->catalog) && isset($trailer['Root'])) {
            $this->catalog = $trailer['Root'];
        }
        if (isset($trailer['Prev']) && is_numeric($trailer['Prev'])) {
            $this->parse_xref($trailer['Prev']);
        }
    }

    /**
     * Collect all pages of the document in page-tree order.
     *
     * @return array List of pdf_page objects (empty when no page tree was found).
     */
    public function get_pages()
    {
        $pages = array();

        if (is_array($this->catalog)
            && isset($this->catalog['Pages'])
            && is_object($this->catalog['Pages'])
        ) {
            $root = $this->catalog['Pages']->resolve();
            if (is_array($root)) {
                $this->add_pages($root, $pages);
            }
        }

        return $pages;
    }

    /**
     * Walk a page-tree node and append every /Page found beneath it.
     *
     * @param array $array Dictionary of a /Pages or /Page node.
     * @param array $pages Output list, extended in place.
     */
    public function add_pages($array, &$pages)
    {
        $type = isset($array['Type']) ? $array['Type'] : null;

        if ($type == 'Pages') {
            $kids = isset($array['Kids']) ? $array['Kids'] : array();
            if (is_object($kids)) {
                $kids = $kids->resolve();
            }
            if (!is_array($kids)) {
                return;
            }
            foreach ($kids as $kid) {
                $node = is_object($kid) ? $kid->resolve() : $kid;
                if (is_array($node)) {
                    $this->add_pages($node, $pages);
                }
            }
        } elseif ($type == 'Page') {
            $pages[] = new pdf_page($array);
        }
    }

    /**
     * Dimensions of the first page.
     *
     * @param mixed $array Unused; kept for backward compatibility.
     * @return array|false array(width, height), or false when there is no page.
     */
    public function get_dimensions($array = false)
    {
        $pages = $this->get_pages();
        if (!isset($pages[0])) {
            return false;
        }
        return $pages[0]->get_dimensions();
    }

    /** @return mixed Whatever pdf_debug() produces for the catalog. */
    public function debug()
    {
        return pdf_debug($this->catalog);
    }

    /**
     * Load the object a reference points to.
     *
     * The cursor is restored afterwards, so this can be called while parsing.
     *
     * @param object $reference Object whose ->value is an "objnum generation" key.
     * @return mixed The parsed object, or null when the key is unknown or free.
     */
    public function resolve($reference)
    {
        $key = $reference->value;
        if (!isset($this->xref_table[$key])) {
            return null;
        }

        $saved_offset = $this->offset;
        $this->offset = $this->xref_table[$key];
        $value = $this->read_object();
        $this->offset = $saved_offset;

        return $value;
    }
}
/**
 * One page of a document, wrapping the page's dictionary.
 */
class pdf_page
{
    /** The page dictionary as a PHP array (name => value). */
    public $props;

    /**
     * @param array $props The page dictionary.
     */
    public function __construct($props = array())
    {
        $this->pdf_page($props);
    }

    /**
     * Legacy (PHP 4 style) initialiser; PHP 8 no longer treats it as a
     * constructor, hence __construct() above.
     *
     * @param array $props The page dictionary.
     */
    public function pdf_page($props)
    {
        $this->props = $props;
    }

    /**
     * Page size, taking /Rotate into account.
     *
     * /MediaBox and /Rotate are looked up on the page itself and, when absent,
     * on its /Parent chain.
     *
     * @return array array(width, height); array(0, 0) when no usable /MediaBox exists.
     */
    public function get_dimensions()
    {
        $mediabox = false;
        $rotate = false;
        $node = $this->props;

        // The hop limit protects against a /Parent chain that loops back on itself.
        for ($hops = 0; $hops < 64 && is_array($node); $hops++) {
            if ($mediabox === false && isset($node['MediaBox'])) {
                $mediabox = $node['MediaBox'];
            }
            if ($rotate === false && isset($node['Rotate'])) {
                $rotate = $node['Rotate'];
            }
            if ($mediabox !== false && $rotate !== false) {
                break;
            }
            if (!isset($node['Parent']) || !is_object($node['Parent'])) {
                break;
            }
            $node = $node['Parent']->resolve();
        }

        // Either value may itself be stored as an indirect object.
        if (is_object($mediabox)) {
            $mediabox = $mediabox->resolve();
        }
        if (is_object($rotate)) {
            $rotate = $rotate->resolve();
        }

        if (!is_array($mediabox) || count($mediabox) < 4) {
            return array(0, 0);
        }

        list($x1, $y1, $x2, $y2) = array_values($mediabox);
        $width = abs($x1 - $x2);
        $height = abs($y1 - $y2);

        // A missing /Rotate (false) counts as 0 degrees.
        if (((int) $rotate % 180) == 0) {
            return array($width, $height);
        }
        return array($height, $width);
    }

    /**
     * Build a content stream from the page's /Contents.
     *
     * /Contents may be absent, a single stream, an array of streams, or a
     * reference to either.  Several streams are joined with a newline so that
     * the last token of one part cannot fuse with the first token of the next.
     *
     * @return pdf_content_stream
     */
    public function get_content_stream()
    {
        $content_data = '';
        if (is_array($this->props) && isset($this->props['Contents'])) {
            $content_data = $this->stream_bytes($this->props['Contents']);
        }
        return new pdf_content_stream($content_data);
    }

    /**
     * Decoded bytes of a /Contents value.
     *
     * @param mixed $part Stream object, reference, or array of those.
     * @return string Decoded data; '' for parts that cannot be decoded.
     */
    protected function stream_bytes($part)
    {
        if (is_object($part)) {
            $part = $part->resolve();
        }

        if (is_array($part)) {
            $chunks = array();
            foreach ($part as $item) {
                $chunks[] = $this->stream_bytes($item);
            }
            return implode("\n", $chunks);
        }

        if (is_object($part) && method_exists($part, 'get_data')) {
            // get_data() may return false for an unsupported filter.
            return (string) $part->get_data();
        }

        return '';
    }

    /** @return string Text collected from the page's content stream. */
    public function get_text()
    {
        return $this->get_content_stream()->get_text();
    }

    /** @return mixed Whatever pdf_debug() produces for the page dictionary. */
    public function debug()
    {
        return pdf_debug($this->props);
    }
}
/**
 * A page content stream, tokenised into the operators that occur between
 * BT (begin text) and ET (end text).
 *
 * Each recorded operator carries the operands that preceded it in
 * ->operands.  The BT operator itself is recorded; ET is not.
 */
class pdf_content_stream extends pdf_readstream
{
    /** List of operator objects recorded while inside a BT ... ET section. */
    public $operators;

    /**
     * @param string $data Decoded content-stream bytes.
     */
    public function __construct($data = '')
    {
        $this->pdf_content_stream($data);
    }

    /**
     * Legacy (PHP 4 style) initialiser; PHP 8 no longer treats it as a
     * constructor, hence __construct() above.  Tokenises $data immediately.
     *
     * @param string $data Decoded content-stream bytes.
     */
    public function pdf_content_stream($data)
    {
        parent::pdf_readstream($data);
        $this->allow_references = false;
        $this->operators = array();

        $operands = array();
        $in_text = false;

        while (!$this->eof()) {
            $before = $this->offset;
            $object = $this->read_object();

            if ($this->offset <= $before) {
                // The tokenizer did not consume anything (e.g. a stray
                // delimiter); step over one byte so the loop terminates.
                $this->offset = $before + 1;
            }

            if (!is_object($object)) {
                // Plain values are operands of the operator that follows them.
                $operands[] = $object;
                continue;
            }

            if ($object->value == 'BT') {
                $in_text = true;
            } elseif ($object->value == 'ET') {
                $in_text = false;
            }

            if ($in_text) {
                $object->operands = $operands;
                $this->operators[] = $object;
            }
            $operands = array();
        }
    }

    /**
     * @return string Concatenation of get_text() of every recorded operator.
     */
    public function get_text()
    {
        $text = '';
        foreach ($this->operators as $operator) {
            $text .= $operator->get_text();
        }
        return $text;
    }

    /**
     * Print the recorded operators, indented by $level tabs.
     *
     * @param int $level Indentation depth.
     */
    public function debug($level = 0)
    {
        $inset = str_repeat("\t", $level);
        echo $inset . "content_stream\n";
        echo $inset . "(\n";
        foreach ($this->operators as $operator) {
            echo $operator->debug($level + 1);
        }
        echo $inset . ")\n";
    }
}
/**
 * Render a parsed PDF value as indented, human-readable text.
 *
 * Objects are delegated to their own debug() method; arrays are printed
 * recursively; booleans, null, strings and numbers are printed on one line.
 *
 * @param mixed $value The value to render.
 * @param int   $level Indentation depth (one tab per level).
 * @return mixed For non-objects a string ending in "\n"; for objects whatever
 *               their debug() method returns.
 */
function pdf_debug($value, $level = 0)
{
    $inset = str_repeat("\t", $level);

    if (is_object($value)) {
        return $value->debug($level);
    }

    if (is_array($value)) {
        $str = $inset . "Array\n";
        $str .= $inset . "(\n";
        // foreach replaces the former each() loop; each() was removed in PHP 8.
        foreach ($value as $key => $v) {
            if (is_object($v) || is_array($v)) {
                $str .= $inset . "\t" . $key . " =>\n";
                $str .= pdf_debug($v, $level + 2);
            } else {
                $str .= $inset . "\t" . $key . " => " . pdf_debug($v);
            }
        }
        $str .= $inset . ")\n";
        return $str;
    }

    if (is_bool($value)) {
        return $inset . ($value ? "true\n" : "false\n");
    }

    if (is_null($value)) {
        return $inset . "NULL\n";
    }

    if (is_string($value)) {
        return $inset . "\"$value\"\n";
    }

    return $inset . $value . "\n";
}
/**
 * Base class for parsed values that need to be objects (references, streams,
 * operators).  Holds the wrapped value and the stream that produced it.
 */
class pdf_object
{
    /** The read stream that created this object (set from outside, see $parent users). */
    public $parent;
    /** The wrapped value. */
    public $value;

    /**
     * @param mixed $value The value to wrap.
     */
    public function __construct($value = null)
    {
        $this->pdf_object($value);
    }

    /**
     * Legacy (PHP 4 style) initialiser; PHP 8 no longer treats it as a
     * constructor, hence __construct() above.
     *
     * @param mixed $value The value to wrap.
     */
    public function pdf_object($value)
    {
        $this->value = $value;
    }

    /**
     * @return pdf_object This object; subclasses may return what they point to.
     */
    public function resolve()
    {
        return $this;
    }

    /** @return mixed The wrapped value. */
    public function get_value()
    {
        return $this->value;
    }

    /**
     * Render this object as indented text.
     *
     * @param int $level Indentation depth (one tab per level).
     * @return string
     */
    public function debug($level = 0)
    {
        $inset = str_repeat("\t", $level);
        $str = $inset . get_class($this) . ' : ' . "\n";
        $str .= $inset . "(\n";
        $str .= pdf_debug($this->value, $level + 1);
        $str .= $inset . ")\n";
        return $str;
    }
}
/**
 * An indirect reference.  ->value is the key handed to the owning stream's
 * resolve() method; ->parent is the stream that created the reference.
 */
class pdf_reference extends pdf_object
{
    /**
     * Declared explicitly so the value is stored on PHP 8 as well, where an
     * inherited PHP 4 style constructor is no longer invoked.
     *
     * @param mixed $value Lookup key of the referenced object.
     */
    public function __construct($value = null)
    {
        $this->value = $value;
    }

    /**
     * Ask the owning stream for the referenced object.
     *
     * @return mixed The resolved object, or null when the reference has no owner.
     */
    public function resolve()
    {
        if (!is_object($this->parent)) {
            return null;
        }
        return $this->parent->resolve($this);
    }
}
/**
 * A stream object: ->value is the stream dictionary, ->data the raw
 * (still encoded) stream bytes.
 */
class pdf_stream extends pdf_object
{
    /** Raw stream bytes exactly as read from the file. */
    public $data;

    /**
     * @param array  $value Stream dictionary.
     * @param string $data  Raw stream bytes.
     */
    public function __construct($value = null, $data = '')
    {
        $this->pdf_stream($value, $data);
    }

    /**
     * Legacy (PHP 4 style) initialiser; PHP 8 no longer treats it as a
     * constructor, hence __construct() above.
     *
     * @param array  $value Stream dictionary.
     * @param string $data  Raw stream bytes.
     */
    public function pdf_stream($value, $data)
    {
        $this->value = $value;
        $this->data = $data;
    }

    /**
     * Decode the stream according to its /Filter entry.
     *
     * Supported: no filter, and a single FlateDecode filter (given directly or
     * as a one-element array).
     *
     * @return string|false Decoded bytes, or false when the filter is
     *                      unsupported or decompression fails.
     */
    public function get_data()
    {
        $object = $this->resolve();

        $filter = false;
        if (is_array($object->value) && isset($object->value['Filter'])) {
            $filter = $object->value['Filter'];
        }
        if (is_object($filter)) {
            $filter = $filter->resolve();
        }
        if (is_array($filter)) {
            if (count($filter) > 1) {
                // Filter chains are not supported.
                return false;
            }
            $filter = count($filter) === 1 ? reset($filter) : false;
        }

        if ($filter === false || $filter === null || $filter === '') {
            return $object->data;
        }

        if ($filter === 'FlateDecode') {
            // Best effort: gzuncompress() raises a warning on corrupt input,
            // which is reported to the caller as a false return instead.
            $data = @gzuncompress($object->data);
            return ($data === false) ? false : $data;
        }

        return false;
    }

    /**
     * Interpret the decoded stream as a content stream and extract its text.
     * (This previously called get_content_stream(), which this class does not have.)
     *
     * @return string
     */
    public function get_text()
    {
        $data = (string) $this->get_data();
        $content = new pdf_content_stream($data);
        return $content->get_text();
    }
}
/**
 * A content-stream operator.  ->value is the operator keyword, ->operands the
 * list of values that preceded it.
 */
class pdf_operator extends pdf_object
{
    /** Operands of this operator, in stream order. */
    public $operands = array();

    /**
     * Declared explicitly so the keyword is stored on PHP 8 as well, where an
     * inherited PHP 4 style constructor is no longer invoked.
     *
     * @param string $value Operator keyword.
     */
    public function __construct($value = null)
    {
        $this->value = $value;
    }

    /**
     * Print the operator and its operands, indented by $level tabs.
     *
     * @param int $level Indentation depth.
     */
    public function debug($level = 0)
    {
        $inset = str_repeat("\t", $level);
        echo $inset . "operator( " . $this->value . " )\n";

        // is_array() first: count() on null is a TypeError on PHP 8.
        if (is_array($this->operands) && count($this->operands) > 0) {
            echo $inset . "(\n";
            foreach ($this->operands as $operand) {
                echo pdf_debug($operand, $level + 1);
            }
            echo $inset . ")\n";
        }
    }

    /**
     * Text contributed by this operator.
     *
     * Text-showing operators return their string operand; positioning
     * operators return a newline when they carry a non-zero vertical value;
     * everything else returns ''.
     *
     * @return mixed Normally a string.
     */
    public function get_text()
    {
        switch ($this->value) {
            case 'Tj':
                return $this->operand(0, '');

            case '\'':
                return "\n" . $this->operand(0, '');

            case '"':
                return "\n" . $this->operand(2, '');

            case 'TJ':
                $parts = $this->operand(0, array());
                if (!is_array($parts)) {
                    return is_string($parts) ? $parts : '';
                }
                $string = '';
                foreach ($parts as $part) {
                    if (is_string($part)) {
                        $string .= $part;
                    } elseif (is_numeric($part) && $part < -150) {
                        // A large negative adjustment is taken to be a word gap.
                        $string .= ' ';
                    }
                }
                return $string;

            case 'Td':
            case 'TD':
                return ($this->operand(1, 0) != 0) ? "\n" : '';

            case 'Tm':
                return ($this->operand(5, 0) != 0) ? "\n" : '';

            case 'T*':
                return "\n";

            default:
                return '';
        }
    }

    /**
     * Operand at $index, or $default when the operator has no such operand.
     *
     * @param int   $index
     * @param mixed $default
     * @return mixed
     */
    protected function operand($index, $default)
    {
        if (is_array($this->operands) && isset($this->operands[$index])) {
            return $this->operands[$index];
        }
        return $default;
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment