Created
May 6, 2019 14:21
-
-
Save vijinho/2efab98cb016a0a08907f4ba0331fad7 to your computer and use it in GitHub Desktop.
automatically rename PDF files found as: year - title - author(s)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php
/**
 * get-rename.php
 *
 * Reads metadata from files that exiftool can parse and proposes new file
 * names built from that metadata. The parsed metadata is cached on disk in a
 * serialized file (see $data_file further down).
 *
 * https://www.sno.phy.queensu.ca/~phil/exiftool/#filename
 * see https://www.sno.phy.queensu.ca/~phil/exiftool/exiftool_pod.html
 *
 * Relies on command-line tools, tested on MacOS.
 *
 * @license GPLv3 (http://www.gnu.org/licenses/gpl-3.0.html)
 */

// Switches read by debug() and verbose().
define('DEBUG', 1);
define('VERBOSE', 1);

// Copy each file into $move_dupes before renaming it?
$auto_backup = false;

// import files folder
$import_folder = '000000';

// folder to move dupes to (one sub-folder per day); created up-front.
// NOTE: computed before date_default_timezone_set() below, as in the original.
$move_dupes = '/Users/vijay/MEGA/.debris/' . date('Y-m-d');
@mkdir($move_dupes, 0777, true);

// force refresh of file data
$force_refresh = false;
// force refresh of post-processing
$refresh = false;
// force refresh of checksum
$refresh_checksum = false;
// https://skim-app.sourceforge.io/
$skim = true;
$metadata_update = false;

date_default_timezone_set('UTC');

// Runtime ini tweaks, applied in the same order as before.
$ini_settings = [
    'default_charset'               => 'utf-8',
    'mbstring.encoding_translation' => 'On',
    'mbstring.func_overload'        => 6,
    'auto_detect_line_endings'      => true,
];
foreach ($ini_settings as $ini_name => $ini_value) {
    ini_set($ini_name, $ini_value);
}
unset($ini_settings, $ini_name, $ini_value);
// check CLI commands are available
$commands = getCommands();
if (empty($commands)) exit;

// load or create $data of file meta data
$data_file = '.metadata.ser';
$data = serialize_load($data_file);
$errors = 0;
if (empty($force_refresh) && !empty($data)) {
    verbose("Loaded data file: $data_file");
} else {
    verbose("Creating data file: $data_file");
    $exif_output = `exiftool -php -r -G *`;
    $data = [];
    if (!is_string($exif_output) || trim($exif_output) === '') {
        // Without this guard the eval below would be handed '$data = ',
        // which is a parse error and aborts the whole script.
        verbose("exiftool produced no output, nothing to process.");
    } else {
        // SECURITY NOTE(review): this evaluates the output of an external
        // program as PHP code. The output embeds metadata taken from the
        // scanned files; consider switching to `exiftool -json` and
        // json_decode() instead.
        try {
            eval('$data = ' . $exif_output);
        } catch (ParseError $e) {
            verbose("Unable to parse exiftool output: " . $e->getMessage());
            exit(1);
        }
        if (!is_array($data)) {
            $data = [];
        }
    }
    serialize_save($data_file, $data);
}

// check if already processed data
if (empty($refresh) && empty($data[0])) {
    debug("Data already tidied from exiftool.");
} else {
    debug("New data created.");
}
// load in list of files to check against optionally
// books.list holds one entry per line; each entry is matched against the
// "SourceFile" value of a metadata record. When the list is missing or empty
// every record is considered.
$lines = [];
$listed = @file('books.list');
if (!empty($listed)) {
    foreach ($listed as $l) {
        $l = trim($l);
        if ($l !== '') {
            $lines[$l] = $l;
        }
    }
}
unset($listed);

// metadata fields that may carry a date; the first 4 characters are read as a year
$date_fields = [
    'XMP:Date', 'XMP:CreateDate', 'XMP:CreationDate', 'XMP:MetadataDate', 'XMP:ModDate', 'XMP:ModifyDate',
    'PDF:CreateDate'
];

// map of current path => proposed path
$renames = [];
foreach ($data as $i => $metadata) {
    $s = $metadata["SourceFile"];
    if (!file_exists($s)) {
        // file vanished since the metadata was cached: drop it and move on
        unset($data[$i]);
        continue;
    }

    // honour the optional books.list filter
    if (!empty($lines) && !array_key_exists($s, $lines)) {
        continue;
    }

    $file_name = isset($metadata['File:FileName']) ? $metadata['File:FileName'] : basename($s);
    $directory = isset($metadata['File:Directory']) ? $metadata['File:Directory'] : dirname($s);

    // only add a dot when there actually is an extension
    $extension = pathinfo($s, PATHINFO_EXTENSION);
    $ext = ('' === $extension) ? '' : strtolower('.' . $extension);

    // start from a year prefix in the file name, else the current year
    $year = (int) substr($file_name, 0, 4);
    if ($year < 1000) {
        $year = (int) date('Y');
    }

    $author = $title = '';
    $authors = '';
    if (!empty($metadata['PDF:Author'])) {
        $authors = $metadata['PDF:Author'];
    } else if (!empty($metadata['XMP:Author'])) {
        $authors = $metadata['XMP:Author'];
    }
    if (!empty($authors)) {
        // NOTE(review): a list-valued author tag is assumed to arrive as an array - confirm
        $authors = is_array($authors) ? array_values($authors) : preg_split('/&/', $authors);
        sort($authors);
        foreach ($authors as $k => $one_author) {
            $authors[$k] = trim_author($one_author);
        }
        if (count($authors) > 3) {
            $authors = array_slice($authors, 0, 2);
            $authors[] = 'et al';
        }
        // a path separator inside an author name would change the target directory
        $author = str_replace(['/', '\\'], '-', trim(join(' & ', $authors)));
    }

    if (!empty($metadata['PDF:Title'])) {
        $title = $metadata['PDF:Title'];
    } else if (!empty($metadata['XMP:Title'])) {
        $title = $metadata['XMP:Title'];
    }
    $title = trim_title($title);

    // use the earliest plausible year found in any of the date fields
    foreach ($date_fields as $field) {
        if (!isset($metadata[$field]) || !is_scalar($metadata[$field])) {
            continue;
        }
        $y = (int) substr((string) $metadata[$field], 0, 4);
        if ($y > 1000 && $y < $year) {
            $year = $y;
        }
    }

    if (empty($author) || empty($title)) {
        //verbose("Missing mandatory data, skippping.");
        continue;
    }

    $f = sprintf("%d - %s - %s%s", $year, $title, $author, $ext);
    $current = basename($s);
    if ($current !== $f) {
        // SourceFile may already contain the directory, so only its base name is joined
        $renames[$directory . '/' . $current] = $directory . '/' . $f;
    }
    printf("\nFilename:\t%s\nAuthor:\t\t%s\nTitle:\t\t%s\nDate:\t\t%s\nNew:\t\t%s\n",
        $current, $author, $title, $year, $f);
}
// Apply the planned renames, never overwriting an existing file.
if (!empty($renames)) {
    foreach ($renames as $s => $d) {
        $newfile = $d;

        // Target taken: prefix the *file name* (not the whole path) with a
        // timestamp and try again. The sleep guarantees a new timestamp.
        while (file_exists($newfile)) {
            debug("\nFile exists: $newfile\n");
            $newfile = dirname($d) . '/' . time() . '__' . basename($d);
            sleep(1);
        }
        verbose(sprintf("\nCopying:\n\t%s\n\t%s\n", $s, $newfile));

        $backup = null;
        if (!empty($auto_backup)) {
            $backup = $move_dupes . '/' . basename($s);
            verbose(sprintf("\nBacking-up:\n\t%s\n\t%s\n", $s, $backup));
            if (!copy($s, $backup)) {
                verbose(sprintf("\nFailed to copy file:\n\t%s\n\t%s\n", $s, $backup));
                exit(1);
            }
        }

        if (!rename($s, $newfile)) {
            verbose(sprintf("\nFailed to rename file:\n\t%s\n\t%s\n", $s, $newfile));
            // the original is still in place, so the backup copy is not needed
            if (null !== $backup && file_exists($backup)) {
                unlink($backup);
            }
        }
    }
}
//print_r($data);
exit;
/**
 * Shorten an author name: every name part except the last is reduced to its
 * initial, e.g. "Smith, John" becomes "J Smith".
 *
 * Parts separated by commas or parentheses are reversed first, so a
 * "Surname, Forename" form ends with the surname. Empty parts and parts that
 * start with a positive number are dropped (the last part is always kept).
 *
 * @param string $a raw author name
 * @return string shortened author name
 */
function trim_author($a) {
    $a = ucwords(trim($a));

    // "Surname, Forename (extra)" -> "extra Forename Surname"
    $names = preg_split('/[,\(\)]/', $a);
    if (!empty($names)) {
        $a = join(' ', array_reverse($names));
    }

    $names = preg_split('/\s/', $a);
    $last = count($names) - 1;
    for ($i = 0; $i < $last; $i++) {
        $part = trim($names[$i]);
        if (empty($part) || (int) $part > 0) {
            unset($names[$i]);
            continue;
        }
        // Take the first *character*, not the first byte, so that a
        // multibyte initial such as "É" is not cut in half. Falls back to
        // the first byte when the part is not valid UTF-8.
        $names[$i] = (preg_match('/^./us', $part, $m) === 1) ? $m[0] : $part[0];
    }

    return trim(join(' ', $names));
}
/**
 * Turn a document title into a file-name friendly string.
 *
 * Colons become hyphens; every run of characters other than ASCII letters,
 * digits, "-", "'" and "&" collapses into a single space. Titles longer than
 * $maxlen are cut to $maxlen - 3 characters and suffixed with "___".
 * The placeholder title "Untitled" yields an empty string.
 *
 * @param string $t raw title
 * @param int $maxlen maximum length of the returned title
 * @return string cleaned title, possibly empty
 */
function trim_title($t, $maxlen = 90) {
    if ('Untitled' === $t) {
        return '';
    }

    // Split on *runs* of separators and drop empty pieces, otherwise
    // "a,  b" would come out with several consecutive spaces.
    $words = preg_split(
        "/[^a-z0-9\-'&]+/i",
        str_replace(':', '-', (string) $t),
        -1,
        PREG_SPLIT_NO_EMPTY
    );
    $title = join(' ', $words);

    if (strlen($title) > $maxlen) {
        $title = substr($title, 0, $maxlen - 3) . '___';
    }
    return $title;
}
//-----------------------------------------------------------------------------
// functions used above

/**
 * Execute a command and return its streams as an array with the keys
 * stdin, stdout, stderr
 *
 * Nothing is written to the command's stdin: it is closed immediately so a
 * command that reads stdin sees end-of-file instead of blocking. The 'stdin'
 * entry is kept for backward compatibility and is always an empty string.
 *
 * NOTE(review): stdout is drained completely before stderr is read; a command
 * that writes a very large amount to stderr first may need stream_select().
 *
 * @param string $cmd command to execute
 * @return array|false array $streams | boolean false if failure
 * @see https://secure.php.net/manual/en/function.proc-open.php
 */
function shell_execute($cmd)
{
    $descriptors = [
        0 => ['pipe', 'r'], // child's stdin
        1 => ['pipe', 'w'], // child's stdout
        2 => ['pipe', 'w'], // child's stderr
    ];
    $pipes = [];
    $process = proc_open($cmd, $descriptors, $pipes);
    if (!is_resource($process)) {
        return false;
    }

    fclose($pipes[0]);

    $stdout = stream_get_contents($pipes[1]);
    fclose($pipes[1]);

    $stderr = stream_get_contents($pipes[2]);
    fclose($pipes[2]);

    // all pipes must be closed before proc_close()
    proc_close($process);

    return [
        'stdin'  => '',
        'stdout' => (false === $stdout) ? '' : $stdout,
        'stderr' => (false === $stderr) ? '' : $stderr
    ];
}
/**
 * Execute a command and return the output of stdout, or throw an exception
 * carrying stderr when the command wrote anything to it.
 *
 * @param string $cmd command to execute
 * @param boolean $split split returned results? default on newline
 * @param string $exp regular expression to preg_split to split on
 * @return mixed string stdout when not splitting or when stdout is empty,
 *               otherwise the array produced by preg_split
 * @throws Exception if the command could not be started or wrote to stderr
 * @see shell_execute($cmd)
 */
function cmd_execute($cmd, $split = true, $exp = "/\n/")
{
    $result = shell_execute($cmd);
    if (!is_array($result)) {
        // shell_execute() returns false when the process cannot be started
        throw new Exception("Unable to execute command: $cmd");
    }
    if (!empty($result['stderr'])) {
        throw new Exception($result['stderr']);
    }

    $data = $result['stdout'];
    if (empty($split) || empty($exp) || empty($data)) {
        return $data;
    }
    return preg_split($exp, $data);
}
/**
 * Check that the required command-line tools are installed and look up
 * their paths with `which`.
 *
 * The result is cached in a static variable once at least one tool was
 * found. Missing tools are reported through debug(); the tools that were
 * found are still returned.
 *
 * @return array map of tool name => first line printed by `which`
 */
function getCommands()
{
    static $commands = []; // cli command paths
    if (!empty($commands)) {
        return $commands;
    }

    $requirements = [
        'exiftool' => 'https://sno.phy.queensu.ca/~phil/exiftool/',
        'gs' => 'http://manpages.ubuntu.com/manpages/bionic/man1/gs.1.html',
        'cp' => 'copy system command - cp',
        'mv' => 'move system command - mv',
        'find' => 'system find commmand'
    ];

    $errors = [];
    foreach ($requirements as $tool => $description) {
        try {
            $cmd = cmd_execute('which ' . escapeshellarg($tool));
        } catch (Exception $e) {
            // cmd_execute() throws when the command writes to stderr;
            // treat that the same as "not found" instead of aborting.
            $cmd = [];
        }
        if (!is_array($cmd) || empty($cmd[0])) {
            $errors[] = "Error: Missing requirement: $tool - " . $description;
        } else {
            $commands[$tool] = $cmd[0];
        }
    }

    if (!empty($errors)) {
        debug(join("\n", $errors) . "\n");
    }
    return $commands;
}
/**
 * Report the script's memory consumption as "current/peak", both values in
 * megabytes rounded up.
 *
 * @return string memory used
 */
function get_memory_used()
{
    $current_mb = ceil(memory_get_usage() / 1024 / 1024);
    $peak_mb = ceil(memory_get_peak_usage() / 1024 / 1024);

    return $current_mb . '/' . $peak_mb;
}
/**
 * Write text to STDERR when requested and available, otherwise echo it.
 *
 * @param string $text string to output
 * @param boolean $STDERR write to stderr if it is available
 */
function output($text, $STDERR = true)
{
    $use_stderr = !empty($STDERR) && defined('STDERR');
    if (!$use_stderr) {
        echo $text;
        return;
    }
    fwrite(STDERR, $text);
}
/**
 * Print a debug line, prefixed with the current memory usage, when the DEBUG
 * constant is truthy; optionally followed by a print_r() dump of $data.
 *
 * @param optional string $string string to output
 * @param optional mixed $data to dump
 * @return boolean true if string output, false if not
 */
function debug($string = '', $data = [])
{
    if (!DEBUG) {
        return false;
    }

    $line = '[D ' . get_memory_used() . '] ' . $string;
    output(trim($line) . "\n");

    if (!empty($data)) {
        output(print_r($data, true));
    }
    return true;
}
/**
 * Print a message when the VERBOSE constant is truthy and the message is not
 * empty; the memory usage is added to the prefix when DEBUG is truthy too.
 * Optionally followed by a print_r() dump of $data.
 *
 * @param string $string string to output
 * @param optional mixed $data to dump
 * @return boolean true if string output, false if not
 */
function verbose($string, $data = [])
{
    if (!VERBOSE || empty($string)) {
        return false;
    }

    $memory = DEBUG ? ' ' . get_memory_used() : '';
    $line = '[V' . $memory . '] ' . $string;
    output(trim($line) . "\n");

    if (!empty($data)) {
        output(print_r($data, true));
    }
    return true;
}
/**
 * Clear an array of empty values, recursively.
 *
 * A value is removed when empty() is true for it, except the integer 0 which
 * is kept. Nested arrays are cleaned first and removed when nothing is left
 * in them. Entries whose key is listed in $keys are always removed.
 *
 * @param array $array the array to clean
 * @param array $keys array keys to explicitly remove regardless
 * @return array the trimmed down array
 */
function array_clear($array, $keys = [])
{
    if (!is_array($array)) {
        return $array;
    }

    foreach ($array as $key => $value) {
        if (in_array($key, $keys, true)) {
            unset($array[$key]);
            continue;
        }

        if (is_array($value)) {
            // One recursive pass already yields a fully cleaned sub-array,
            // so there is no need to repeat it until the result is stable.
            $value = array_clear($value, $keys);
            $array[$key] = $value;
        }

        if (empty($value) && 0 !== $value) {
            unset($array[$key]);
        }
    }
    return $array;
}
/**
 * Convert the character encoding of a value, recursing into arrays and
 * objects.
 *
 * Integers and floats are returned unchanged. Numeric strings are converted
 * to a number: an int when the string denotes one, otherwise a float (so
 * "1.5" becomes 1.5 rather than being truncated to 1). All other strings are
 * passed through mb_convert_encoding(). Any other type is returned as is.
 *
 * @param mixed $data
 * @param string $to_charset convert to encoding
 * @param string $from_charset convert from encoding
 * @return mixed
 */
function to_charset($data, $to_charset = 'UTF-8', $from_charset = 'auto')
{
    if (is_int($data) || is_float($data)) {
        return $data;
    }

    if (is_string($data)) {
        if (is_numeric($data)) {
            // arithmetic conversion picks int or float from the string's content
            return $data + 0;
        }
        return mb_convert_encoding($data, $to_charset, $from_charset);
    }

    if (is_array($data)) {
        foreach ($data as $key => $value) {
            $data[$key] = to_charset($value, $to_charset, $from_charset);
        }
    } else if (is_object($data)) {
        foreach ($data as $key => $value) {
            $data->$key = to_charset($value, $to_charset, $from_charset);
        }
    }
    return $data;
}
/**
 * Load a serialized php data file, strip empty values from it with
 * array_clear() and return it.
 *
 * An empty array is returned when the file is missing or unreadable, or when
 * its content does not unserialize to an array. Objects inside the file are
 * not instantiated (allowed_classes => false).
 *
 * @param string $file the serialized data filename
 * @return array $data
 */
function serialize_load($file)
{
    if (!is_file($file) || !is_readable($file)) {
        return [];
    }

    $raw = file_get_contents($file);
    if (false === $raw || '' === $raw) {
        return [];
    }

    $data = unserialize($raw, ['allowed_classes' => false]);
    if (!is_array($data)) {
        // corrupt or unexpected content: behave as if there were no cache
        return [];
    }

    $data = array_clear($data);
    return empty($data) ? [] : $data;
}
/**
 * Save a data array to a file as php serialized data, after stripping empty
 * values from it with array_clear().
 *
 * @param string $file the serialized data filename
 * @param array $data data to save
 * @return int|false number of bytes written, or false when there is nothing
 *                   to write or the write failed
 */
function serialize_save($file, $data)
{
    // Previously this returned a (truthy) message string, which a caller
    // checking the result would have read as success.
    if (empty($data)) {
        return false;
    }

    $data = array_clear($data);
    if (empty($data)) {
        return false;
    }
    return file_put_contents($file, serialize($data));
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment