Skip to content

Instantly share code, notes, and snippets.

@vijinho
Last active August 17, 2023 16:00
Show Gist options
  • Save vijinho/c0e6a7cba5c8887199935dc26eafb8d2 to your computer and use it in GitHub Desktop.
Save vijinho/c0e6a7cba5c8887199935dc26eafb8d2 to your computer and use it in GitHub Desktop.
Use qpdf to linearize a batch of PDF files
<?php
/**
* pdf-linearize.php - linearize PDFs
* WARNING: May run extremely slowly due to very large file sizes generated on pdf2ps
*
* relies on command-line tools, tested on ubuntu.
*
* @license GPLv3 (http://www.gnu.org/licenses/gpl-3.0.html)
*/
// Folder that is searched recursively for PDF files.
$folder = '.';
define('DEBUG',1);
define('VERBOSE',1);
date_default_timezone_set('UTC');
ini_set('default_charset', 'utf-8');
ini_set('mbstring.encoding_translation', 'On');
ini_set('mbstring.func_overload', 6);
ini_set('auto_detect_line_endings', TRUE);

// check CLI commands are available
$commands = getCommands();
if (empty($commands)) exit;
// Guard against a partially resolved tool list: every tool used below must have a path.
foreach (['find', 'exiftool', 'qpdf'] as $tool) {
    if (empty($commands[$tool])) {
        output("Error: required command not available: $tool\n");
        exit(1);
    }
}

// Collect candidate files. Every argument is shell-escaped so an unusual
// folder name cannot be interpreted by the shell.
$ext = '.pdf';
$cmd = escapeshellarg($commands['find']) . ' ' . escapeshellarg($folder)
    . ' -type f -iname ' . escapeshellarg('*' . $ext) . ' -print';
try {
    $files = cmd_execute($cmd);
} catch (Exception $e) {
    output("Error: could not list files: " . $e->getMessage() . "\n");
    exit(1);
}
// cmd_execute() returns an empty string (not an array) when find prints nothing.
if (!is_array($files)) {
    $files = [];
}

$total = count($files);
// Resume threshold: files are skipped while more than $start entries remain.
$start = 99999;

foreach ($files as $path) {
    $total--;
    $path = trim($path);
    // The split find output ends with an empty element; it is rejected here.
    if ($path === '' || !file_exists($path)) {
        continue;
    }
    // Zero-byte files are removed outright.
    if (0 === filesize($path)) {
        unlink($path);
        continue;
    }
    if ($total > $start) {
        continue;
    }
    output("Checking [" . $total . "]: $path\n");

    // Read the metadata as JSON. The previous implementation eval()'d the
    // `-php` output of exiftool, i.e. executed text derived from file contents.
    $cmd = escapeshellarg($commands['exiftool']) . ' -json -G ' . escapeshellarg($path);
    debug($cmd);
    try {
        $json = cmd_execute($cmd, false);
    } catch (Exception $e) {
        // Without metadata the file state is unknown; leave the file untouched.
        debug("Failed getting file data with exiftool.");
        continue;
    }
    $decoded = json_decode((string) $json, true);
    $metadata = (is_array($decoded) && isset($decoded[0]) && is_array($decoded[0]))
        ? $decoded[0]
        : [];
    if (isset($metadata['PDF:Linearized']) && 'Yes' === $metadata['PDF:Linearized']) {
        verbose("Already linearized!");
        continue;
    }
    debug('Metadata:', $metadata);

    // linearize pdf: move the original aside and let qpdf write the new file
    // under the original name.
    $bak = $path . '.bak';
    if (!rename($path, $bak)) {
        output("\tCould not create backup, skipping: $path\n");
        continue;
    }
    verbose("Linearizing...");
    $cmd = sprintf(
        '%s --linearize %s %s',
        escapeshellarg($commands['qpdf']),
        escapeshellarg($bak),
        escapeshellarg($path)
    );
    debug($cmd);
    try {
        $output = cmd_execute($cmd);
    } catch (Exception $e) {
        // qpdf reported something on stderr: put the original back in place.
        rename($bak, $path);
        debug($e->getMessage());
        continue;
    }
    debug('qpdf output:', $output);

    // File sizes were stat'ed before the rewrite; drop PHP's stat cache first.
    clearstatcache();
    $newSize = file_exists($path) ? filesize($path) : 0;
    if (empty($newSize)) {
        // qpdf produced nothing usable: restore the original rather than
        // deleting the only remaining copy.
        rename($bak, $path);
        output("\tLinearizing failed, original restored: $path\n");
        continue;
    }
    $oldSize = filesize($bak);
    // The linearized file is kept even when it is larger; savings are only reported.
    if ($newSize <= $oldSize) {
        output("\tShrinking, saved: " . ceil(($oldSize - $newSize) / 1024) . " Kbytes\n");
    }
    unlink($bak);
}
exit;
//-----------------------------------------------------------------------------
// functions used above
/**
 * Execute a command through the shell and capture its output streams.
 *
 * Nothing is written to the child's stdin: it is closed immediately so that
 * commands which read stdin see end-of-file instead of blocking. stdout and
 * stderr are drained together, so a child that fills one pipe while the other
 * is still open cannot deadlock the parent.
 *
 * @param string $cmd command to execute
 * @return array|false array with the keys 'stdin' (always an empty string,
 *                     kept for backward compatibility), 'stdout' and 'stderr'
 *                     | boolean false if the process could not be started
 * @see https://secure.php.net/manual/en/function.proc-open.php
 */
function shell_execute($cmd)
{
    $pipes = [];
    $process = proc_open(
        $cmd,
        [
            ['pipe', 'r'],
            ['pipe', 'w'],
            ['pipe', 'w']
        ],
        $pipes
    );
    if (!is_resource($process)) {
        return false;
    }

    // The parent's end of stdin is write-only; there is nothing to send.
    fclose($pipes[0]);

    $buffers = [1 => '', 2 => ''];
    $open = [1 => $pipes[1], 2 => $pipes[2]];
    while (!empty($open)) {
        $read = array_values($open);
        $write = null;
        $except = null;
        // Block until at least one of the remaining pipes has data or hits EOF.
        if (false === stream_select($read, $write, $except, null)) {
            break;
        }
        foreach ($read as $stream) {
            $id = array_search($stream, $open, true);
            $chunk = fread($stream, 8192);
            if (false === $chunk || ('' === $chunk && feof($stream))) {
                fclose($stream);
                unset($open[$id]);
                continue;
            }
            $buffers[$id] .= $chunk;
        }
    }
    // Only reached with open pipes when stream_select() failed.
    foreach ($open as $stream) {
        fclose($stream);
    }
    proc_close($process);

    return [
        'stdin' => '',
        'stdout' => $buffers[1],
        'stderr' => $buffers[2]
    ];
}
/**
 * Execute a command and return its stdout, or throw if it wrote to stderr.
 *
 * Any non-empty stderr output is treated as a failure, as is a command that
 * could not be started at all.
 *
 * @param string $cmd command to execute
 * @param boolean $split split returned results? default on newline
 * @param string $exp regular expression to preg_split to split on
 * @return mixed the raw stdout string when $split or $exp is empty or when
 *               there was no output, otherwise the array produced by
 *               preg_split (output ending in a newline yields a trailing
 *               empty element)
 * @throws Exception with the stderr text, or when the command cannot be started
 * @see shell_execute($cmd)
 */
function cmd_execute($cmd, $split = true, $exp = "/\n/")
{
    $result = shell_execute($cmd);
    // shell_execute() returns false when the process could not be created;
    // indexing that as an array would silently yield null.
    if (!is_array($result)) {
        throw new Exception("Unable to start command: $cmd");
    }
    if (!empty($result['stderr'])) {
        throw new Exception($result['stderr']);
    }
    $data = $result['stdout'];
    if (empty($split) || empty($exp) || empty($data)) {
        return $data;
    }
    return preg_split($exp, $data);
}
/**
 * Check that the required commands are installed and resolve their paths.
 *
 * Each tool is looked up with `which`. A message is echoed for every missing
 * tool, and in that case an empty array is returned so callers can abort with
 * a simple empty() check. A complete result is cached for later calls.
 *
 * @return array map of tool name => path, or an empty array if any tool is missing
 */
function getCommands()
{
    static $commands = []; // cli command paths
    if (!empty($commands)) {
        return $commands;
    }
    $requirements = [
        'qpdf' => 'http://qpdf.sourceforge.net/',
        'exiftool' => 'https://sno.phy.queensu.ca/~phil/exiftool/',
        'cp' => 'copy system command - cp',
        'mv' => 'move system command - mv',
        'find' => 'system find commmand',
    ];
    $found = [];
    $errors = [];
    foreach ($requirements as $tool => $description) {
        try {
            $cmd = cmd_execute('which ' . escapeshellarg($tool));
        } catch (Exception $e) {
            // cmd_execute() throws on any stderr output; treat that as "not found".
            $cmd = [];
        }
        // cmd_execute() returns an empty string (not an array) when nothing was printed.
        $path = (is_array($cmd) && isset($cmd[0])) ? trim($cmd[0]) : '';
        if ('' === $path) {
            $errors[] = "Error: Missing requirement: $tool - " . $description;
        } else {
            $found[$tool] = $path;
        }
    }
    if (!empty($errors)) {
        echo join("\n", $errors) . "\n";
        // Do not hand back (or cache) a partial list that would pass the caller's empty() check.
        return [];
    }
    $commands = $found;
    return $commands;
}
/**
 * Report the script's memory consumption in whole megabytes.
 *
 * Both figures are rounded up and joined as "current/peak".
 *
 * @return string memory used, formatted as "<current>/<peak>"
 */
function get_memory_used()
{
    $currentMb = ceil(memory_get_usage() / 1024 / 1024);
    $peakMb = ceil(memory_get_peak_usage() / 1024 / 1024);

    return $currentMb . '/' . $peakMb;
}
/**
 * Write text to STDERR when requested and available, otherwise echo it.
 *
 * @param string $text string to output
 * @param boolean $STDERR write to stderr if it is available
 */
function output($text, $STDERR = true)
{
    $useStderr = !empty($STDERR) && defined('STDERR');
    if (!$useStderr) {
        echo $text;
        return;
    }
    fwrite(STDERR, $text);
}
/**
 * Emit a debug line, prefixed with the memory usage, when DEBUG is enabled.
 *
 * @param optional string $string string to output
 * @param optional mixed $data extra data, dumped with print_r when not empty
 * @return boolean true if DEBUG is enabled and the line was written, false otherwise
 */
function debug($string = '', $data = [])
{
    if (!DEBUG) {
        return false;
    }

    $line = '[D ' . get_memory_used() . '] ' . $string;
    output(trim($line) . "\n");

    if (!empty($data)) {
        output(print_r($data, true));
    }

    return true;
}
/**
 * Emit a progress line when VERBOSE is enabled and the message is not empty.
 *
 * The memory usage is added to the prefix only while DEBUG is enabled.
 *
 * @param string $string string to output
 * @param optional mixed $data extra data, dumped with print_r when not empty
 * @return boolean true if the line was written, false otherwise
 */
function verbose($string, $data = [])
{
    if (!VERBOSE || empty($string)) {
        return false;
    }

    $memory = (DEBUG) ? ' ' . get_memory_used() : '';
    $line = '[V' . $memory . '] ' . $string;
    output(trim($line) . "\n");

    if (!empty($data)) {
        output(print_r($data, true));
    }

    return true;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment