Skip to content

Instantly share code, notes, and snippets.

@vijinho
Created May 6, 2019 14:21
Show Gist options
  • Save vijinho/2efab98cb016a0a08907f4ba0331fad7 to your computer and use it in GitHub Desktop.
Save vijinho/2efab98cb016a0a08907f4ba0331fad7 to your computer and use it in GitHub Desktop.
Automatically rename PDF files to the form: year - title - author(s)
<?php
/**
* get-rename.php - gets metadata from files readable with exiftool
* and presents options for renaming them.
* Creates a cache file, .metadata.ser
*
* https://www.sno.phy.queensu.ca/~phil/exiftool/#filename
* see https://www.sno.phy.queensu.ca/~phil/exiftool/exiftool_pod.html
*
* relies on command-line tools, tested on MacOS.
*
* @license GPLv3 (http://www.gnu.org/licenses/gpl-3.0.html)
*/
// Script configuration. DEBUG gates the output of debug() and VERBOSE gates
// the output of verbose(); both are enabled here.
define('DEBUG',1);
define('VERBOSE',1);
// When true, each file is copied into $move_dupes before it is renamed.
$auto_backup = false;
$import_folder = '000000'; // import files folder (not referenced elsewhere in the visible script)
// Dated backup folder (Y-m-d). The date is computed before the
// date_default_timezone_set() call below, so it uses whatever timezone PHP
// was configured with at start-up rather than UTC.
$move_dupes = '/Users/vijay/MEGA/.debris/' . join('-', [date('Y'), date('m'), date('d')]); // folder to move dupes to
// Created unconditionally (even when $auto_backup is false); the '@' hides
// the warning raised when the folder already exists or cannot be created.
@mkdir($move_dupes, 0777, true);
$force_refresh = false; // force refresh of file data
$refresh = false; // force refresh of post-processing
$refresh_checksum = false; // force refresh of checksum (not referenced elsewhere in the visible script)
$skim = true; // https://skim-app.sourceforge.io/ (not referenced elsewhere in the visible script)
$metadata_update = false; // not referenced elsewhere in the visible script
// Runtime environment: UTC dates and UTF-8 text handling.
date_default_timezone_set('UTC');
ini_set('default_charset', 'utf-8');
// NOTE(review): the two mbstring directives below may not be changeable at
// runtime via ini_set() and may not exist in newer PHP versions - confirm
// against the PHP manual for the target version.
ini_set('mbstring.encoding_translation', 'On');
ini_set('mbstring.func_overload', 6);
ini_set('auto_detect_line_endings', TRUE);
// Check that the required CLI commands are available; getCommands() returns
// the map of tool name => path for every tool it could locate.
$commands = getCommands();
if (empty($commands)) {
    exit;
}

// Load the cached metadata, or (re)create it by running exiftool over the
// current directory tree.
$data_file = '.metadata.ser';
$data = serialize_load($data_file);
$errors = 0;
if (empty($force_refresh) && !empty($data)) {
    verbose("Loaded data file: $data_file");
} else {
    verbose("Creating data file: $data_file");
    $exiftool_output = `exiftool -php -r -G *`;
    // Without this guard an empty result would be evaluated as the incomplete
    // statement '$data = ' and abort the script with a parse error.
    if (!is_string($exiftool_output) || trim($exiftool_output) === '') {
        debug("exiftool returned no metadata; nothing to process.");
        exit(1);
    }
    // NOTE(review): eval() executes exiftool's output as PHP code, and that
    // output embeds metadata read from the scanned files. Consider switching
    // to `exiftool -json` + json_decode() after confirming the output shape.
    eval('$data = ' . $exiftool_output);
    serialize_save($data_file, $data);
}

// Report whether the loaded data still looks like raw exiftool output
// (a numerically indexed list) or has been processed before.
if (empty($refresh) && empty($data[0])) {
    debug("Data already tidied from exiftool.");
} else {
    debug("New data created.");
}
// Optionally load a list of files to restrict processing to: books.list holds
// one file path per line. The result is a map of path => path so membership
// can be tested with array_key_exists(); it stays empty when there is no list.
$lines = [];
$list_file = 'books.list';
if (is_file($list_file) && is_readable($list_file)) {
    $listed = file($list_file, FILE_IGNORE_NEW_LINES);
    if ($listed !== false) {
        // Build a fresh map instead of re-keying the array being iterated:
        // with in-place re-keying, an entry whose text is an integer (e.g. "5")
        // could be unset again when the loop reached that numeric index.
        foreach ($listed as $entry) {
            $entry = trim($entry);
            if ($entry !== '') {
                $lines[$entry] = $entry;
            }
        }
    }
}
// Metadata fields that may carry a date; the earliest plausible year found in
// any of them replaces the year taken from the file name.
$date_fields = [
    'XMP:Date', 'XMP:CreateDate', 'XMP:CreationDate', 'XMP:MetadataDate', 'XMP:ModDate', 'XMP:ModifyDate',
    'PDF:CreateDate'
];

// Map of current path => proposed path, consumed by the rename pass.
$renames = [];
foreach ($data as $i => $metadata) {
    if (!is_array($metadata) || empty($metadata['SourceFile'])) {
        continue;
    }
    $s = $metadata["SourceFile"];
    if (!file_exists($s)) {
        // Stale cache entry: drop it and do not propose a rename for a file
        // that is no longer there.
        unset($data[$i]);
        continue;
    }

    // When a books.list was loaded, only process the files it names.
    if (!empty($lines) && !array_key_exists($s, $lines)) {
        continue;
    }

    // Keep the original extension (lower-cased); files without one get no dot.
    $extension = pathinfo($s, PATHINFO_EXTENSION);
    $ext = ($extension === '') ? '' : strtolower('.' . $extension);

    // Initial year guess: a leading 4-digit year in the file name, else the
    // current year.
    $file_name = isset($metadata['File:FileName']) ? (string) $metadata['File:FileName'] : basename($s);
    $year = (int) substr($file_name, 0, 4);
    if ($year < 1000) {
        $year = (int) date('Y');
    }

    $author = $authors = $title = '';
    if (!empty($metadata['PDF:Author'])) {
        $authors = $metadata['PDF:Author'];
    } else if (!empty($metadata['XMP:Author'])) {
        $authors = $metadata['XMP:Author'];
    }
    if (is_array($authors)) {
        // NOTE(review): presumably exiftool can emit a list for a
        // multi-valued tag - flatten it so the split below still works.
        $authors = join(' & ', $authors);
    }
    if (!empty($authors)) {
        $authors = preg_split('/&/', (string) $authors);
        sort($authors);
        // Use a dedicated key variable: the outer loop's $i must stay intact.
        foreach ($authors as $k => $name) {
            $authors[$k] = trim_author($name);
        }
        // Drop names that normalised to nothing (e.g. from "A & & B").
        $authors = array_values(array_filter($authors, 'strlen'));
        if (count($authors) > 3) {
            $authors = array_slice($authors, 0, 2);
            $authors[] = 'et al';
        }
        // A "/" in the author would be read as a directory separator.
        $author = str_replace('/', '-', trim(join(' & ', $authors)));
    }

    if (!empty($metadata['PDF:Title'])) {
        $title = $metadata['PDF:Title'];
    } else if (!empty($metadata['XMP:Title'])) {
        $title = $metadata['XMP:Title'];
    }
    $title = trim_title($title);

    foreach ($date_fields as $field) {
        if (array_key_exists($field, $metadata) && is_scalar($metadata[$field])) {
            $y = (int) substr((string) $metadata[$field], 0, 4);
            if ($y > 1000 && $y < $year) {
                $year = $y;
            }
        }
    }

    if (empty($author) || empty($title) || empty($year)) {
        // Missing mandatory data, skip this file.
        continue;
    }

    $f = sprintf("%d - %s - %s%s", $year, $title, $author, $ext);
    // SourceFile may already contain the directory, so compare and rebuild
    // paths from the bare file name to avoid "dir/dir/file" style paths.
    $current_name = basename($s);
    $directory = isset($metadata['File:Directory']) ? $metadata['File:Directory'] : dirname($s);
    if ($current_name !== $f) {
        $renames[$directory . '/' . $current_name] = $directory . '/' . $f;
    }
    printf("\nFilename:\t%s\nAuthor:\t\t%s\nTitle:\t\t%s\nDate:\t\t%s\nNew:\t\t%s\n",
        $current_name, $author, $title, $year, $f);
}
// Apply the proposed renames. An existing destination is never overwritten:
// a "<timestamp>__" prefix is added to the destination's file name instead.
if (!empty($renames)) {
    foreach ($renames as $s => $d) {
        $newfile = $d;
        while (file_exists($newfile)) {
            debug("\nFile exists: $newfile\n");
            // Prefix the file name only; prefixing the whole path would point
            // into a directory that does not exist.
            $candidate = dirname($d) . '/' . time() . '__' . basename($d);
            if ($candidate === $newfile) {
                // Same second as the previous attempt: wait for the clock to tick.
                sleep(1);
                continue;
            }
            $newfile = $candidate;
        }
        verbose(sprintf("\nCopying:\n\t%s\n\t%s\n", $s, $newfile));

        $backup = null;
        if (!empty($auto_backup)) {
            $backup = $move_dupes . '/' . basename($s);
            verbose(sprintf("\nBacking-up:\n\t%s\n\t%s\n", $s, $backup));
            if (!copy($s, $backup)) {
                verbose(sprintf("\nFailed to copy file:\n\t%s\n\t%s\n", $s, $backup));
                exit(1);
            }
        }

        if (!rename($s, $newfile)) {
            verbose(sprintf("\nFailed to rename file:\n\t%s\n\t%s\n", $s, $newfile));
            // The original is still in place, so the backup copy is redundant.
            // Only remove it when one was actually made for this file.
            if ($backup !== null && file_exists($backup)) {
                unlink($backup);
            }
        }
    }
}
//print_r($data);
exit;
/**
 * Normalise an author name to "initials surname" form, e.g.
 * "Smith, John" becomes "J Smith".
 *
 * The name is split on commas and parentheses and the parts are reversed, so
 * "Surname, Forenames" reads forenames-first. Every word except the last is
 * then reduced to its first character; numeric words are dropped.
 *
 * @param string $a raw author name
 * @return string the normalised name, '' when nothing usable remains
 */
function trim_author($a) {
    $a = ucwords(trim($a));
    $parts = preg_split('/[,\(\)]/', $a);
    if (!empty($parts)) {
        $a = join(' ', array_reverse($parts));
    }

    // Drop empty tokens before deciding which word is the surname: otherwise
    // a trailing separator (as in "John Smith (ed)") leaves an empty last
    // token and the real surname gets cut down to an initial.
    $names = [];
    foreach (preg_split('/\s/', $a) as $token) {
        $token = trim($token);
        if ($token !== '') {
            $names[] = $token;
        }
    }

    $last = count($names) - 1;
    for ($i = 0; $i < $last; $i++) {
        // Skip "0" and other numeric words.
        if (empty($names[$i]) || (int) $names[$i] > 0) {
            unset($names[$i]);
            continue;
        }
        // Take the first character, not the first byte, so a multibyte
        // initial such as "É" stays valid UTF-8. Fall back to the first byte
        // when the token is not valid UTF-8.
        if (preg_match('/^./us', $names[$i], $m) === 1) {
            $names[$i] = $m[0];
        } else {
            $names[$i] = $names[$i][0];
        }
    }

    return trim(join(' ', $names));
}
/**
 * Reduce a document title to a filename-safe string.
 *
 * Colons become hyphens, every character other than ASCII letters, digits,
 * "-", "'" and "&" is treated as a word separator, and the remaining words
 * are joined with single spaces. Titles longer than $maxlen are cut and
 * suffixed with "___" so the result is exactly $maxlen characters long.
 *
 * @param string $t      raw title
 * @param int    $maxlen maximum length of the returned title
 * @return string the cleaned title, '' for the placeholder title "Untitled"
 */
function trim_title($t, $maxlen = 90) {
    if ('Untitled' === $t) {
        return '';
    }

    // Split on runs of separators and discard empty pieces, so that input
    // like "Hello, World" does not come out with a double space.
    $words = preg_split(
        "/[^a-z0-9-'&]+/i",
        str_ireplace(':', '-', (string) $t),
        -1,
        PREG_SPLIT_NO_EMPTY
    );
    $title = trim(join(' ', $words));

    // Only the allowed ASCII characters remain, so byte-based length
    // handling is safe here.
    if (strlen($title) > $maxlen) {
        $title = substr($title, 0, $maxlen - 3) . '___';
    }
    return $title;
}
//-----------------------------------------------------------------------------
// functions used above
/**
 * Execute a command and return its output streams as an array with the keys
 * 'stdin', 'stdout' and 'stderr'.
 *
 * Nothing is ever sent to the command, so its stdin is closed immediately and
 * the 'stdin' entry is always an empty string (kept for compatibility).
 *
 * @param string $cmd command to execute
 * @return array|false array of stream contents | boolean false if the
 *                     process could not be started
 * @see https://secure.php.net/manual/en/function.proc-open.php
 */
function shell_execute($cmd)
{
    $pipes = [];
    $process = proc_open(
        $cmd,
        [
            0 => ['pipe', 'r'], // child's stdin
            1 => ['pipe', 'w'], // child's stdout
            2 => ['pipe', 'w'], // child's stderr
        ],
        $pipes
    );
    if (!is_resource($process)) {
        return false;
    }

    // Close stdin first: a command that reads its input would otherwise
    // wait forever for data that is never sent.
    fclose($pipes[0]);

    $stdout = stream_get_contents($pipes[1]);
    $stderr = stream_get_contents($pipes[2]);

    // Pipes are closed before proc_close() so it cannot block on them.
    fclose($pipes[1]);
    fclose($pipes[2]);
    proc_close($process);

    return [
        'stdin'  => '',
        'stdout' => ($stdout === false) ? '' : $stdout,
        'stderr' => ($stderr === false) ? '' : $stderr,
    ];
}
/**
 * Run a command through shell_execute() and hand back what it printed on
 * stdout; anything printed on stderr is raised as an Exception instead.
 *
 * @param string  $cmd   command to execute
 * @param boolean $split whether to split the output (on newlines by default)
 * @param string  $exp   regular expression handed to preg_split()
 * @return mixed the raw stdout when splitting is disabled, $exp is empty or
 *               there is no output; otherwise the array from preg_split()
 * @throws Exception carrying the stderr text when the command wrote to stderr
 * @see shell_execute($cmd)
 */
function cmd_execute($cmd, $split = true, $exp = "/\n/")
{
    $streams = shell_execute($cmd);

    $failure = $streams['stderr'];
    if (!empty($failure)) {
        throw new Exception($failure);
    }

    $stdout = $streams['stdout'];
    $should_split = !empty($split) && !empty($exp) && !empty($stdout);

    return $should_split ? preg_split($exp, $stdout) : $stdout;
}
/**
 * Check that the required command-line tools are installed and return their
 * paths as reported by `which`.
 *
 * The result is cached in a static variable once at least one tool has been
 * found. Missing tools are reported through debug() and left out of the
 * returned map.
 *
 * @return array map of tool name => path for every tool that was found
 */
function getCommands()
{
    static $commands = []; // cli command paths
    if (!empty($commands)) {
        return $commands;
    }
    $requirements = [
        'exiftool' => 'https://sno.phy.queensu.ca/~phil/exiftool/',
        'gs' => 'http://manpages.ubuntu.com/manpages/bionic/man1/gs.1.html',
        'cp' => 'copy system command - cp',
        'mv' => 'move system command - mv',
        'find' => 'system find commmand'
    ];
    $errors = [];
    foreach ($requirements as $tool => $description) {
        try {
            $cmd = cmd_execute("which $tool");
        } catch (Exception $e) {
            // cmd_execute() throws whenever the command writes to stderr.
            // Treat that as "tool not found" so the remaining tools are
            // still checked and the problem is reported below.
            $cmd = [];
        }
        if (empty($cmd) || empty($cmd[0])) {
            $errors[] = "Error: Missing requirement: $tool - " . $description;
        } else {
            $commands[$tool] = $cmd[0];
        }
    }
    if (!empty($errors)) {
        debug(join("\n", $errors) . "\n");
    }
    return $commands;
}
/**
 * Report the script's memory consumption in whole megabytes (rounded up),
 * formatted as "current/peak".
 *
 * @return string memory used, e.g. "2/3"
 */
function get_memory_used()
{
    $current_mb = ceil(memory_get_usage() / 1024 / 1024);
    $peak_mb = ceil(memory_get_peak_usage() / 1024 / 1024);

    return $current_mb . '/' . $peak_mb;
}
/**
 * Write text to STDERR when requested and available, otherwise echo it.
 *
 * @param string  $text   string to output
 * @param boolean $STDERR write to stderr if it is available
 */
function output($text, $STDERR = true)
{
    $stderr_wanted = !empty($STDERR) && defined('STDERR');

    if (!$stderr_wanted) {
        echo $text;
        return;
    }

    fwrite(STDERR, $text);
}
/**
 * Emit a debug line, prefixed with "[D <memory used>]", when the DEBUG
 * constant is truthy, optionally followed by a print_r() dump of $data.
 *
 * @param optional string $string string to output
 * @param optional mixed $data to dump (skipped when empty)
 * @return boolean true if output was written, false when DEBUG is off
 */
function debug($string = '', $data = [])
{
    if (!DEBUG) {
        return false;
    }

    $prefix = '[D ' . get_memory_used() . '] ';
    output(trim($prefix . $string) . "\n");

    if (!empty($data)) {
        output(print_r($data, true));
    }

    return true;
}
/**
 * Emit a message, prefixed with "[V]" (or "[V <memory used>]" when DEBUG is
 * on), when the VERBOSE constant is truthy and the message is not empty,
 * optionally followed by a print_r() dump of $data.
 *
 * @param string $string string to output
 * @param optional mixed $data to dump (skipped when empty)
 * @return boolean true if output was written, false if not
 */
function verbose($string, $data = [])
{
    if (!VERBOSE || empty($string)) {
        return false;
    }

    $memory = DEBUG ? ' ' . get_memory_used() : '';
    output(trim('[V' . $memory . '] ' . $string) . "\n");

    if (!empty($data)) {
        output(print_r($data, true));
    }

    return true;
}
/**
 * Recursively remove empty values from an array.
 *
 * A value is removed when empty() is true for it, except the integer 0, which
 * is kept. Nested arrays are cleaned first and removed if nothing is left in
 * them. Any key listed in $keys is removed at every level regardless of its
 * value.
 *
 * @param array $array the array to clean
 * @param array $keys  array keys to explicitly remove regardless (strict match)
 * @return array the trimmed down array
 */
function array_clear($array, $keys = [])
{
    foreach ($array as $key => $value) {
        if (is_array($value)) {
            // A single recursive pass already yields a fully cleaned array
            // (cleaning it again changes nothing), so repeating the call until
            // the result stabilises only multiplies the work at every level.
            $value = array_clear($value, $keys);
            $array[$key] = $value;
        }
        if (empty($value) && 0 !== $value) {
            unset($array[$key]);
        }
        if (in_array($key, $keys, true)) {
            unset($array[$key]);
        }
    }
    return $array;
}
/**
 * Recursively convert the character encoding of string data.
 *
 * Integers and floats are returned untouched. Numeric strings are converted
 * to the matching number type, so "42" becomes int 42 and "3.14" becomes
 * float 3.14. Other strings are converted with mb_convert_encoding(). Arrays
 * and objects are processed element by element; anything else is returned
 * as is.
 *
 * @param mixed  $data         value to convert
 * @param string $to_charset   convert to encoding
 * @param string $from_charset convert from encoding
 * @return mixed the converted value
 */
function to_charset($data, $to_charset = 'UTF-8', $from_charset = 'auto')
{
    if (is_int($data) || is_float($data)) {
        return $data;
    }
    if (is_string($data)) {
        if (is_numeric($data)) {
            // Adding 0 yields an int or a float as appropriate; an (int) cast
            // would silently truncate values such as "3.14" to 3.
            return $data + 0;
        }
        return mb_convert_encoding($data, $to_charset, $from_charset);
    }
    if (is_array($data)) {
        foreach ($data as $key => $value) {
            $data[$key] = to_charset($value, $to_charset, $from_charset);
        }
    } else if (is_object($data)) {
        foreach ($data as $key => $value) {
            $data->$key = to_charset($value, $to_charset, $from_charset);
        }
    }
    return $data;
}
/**
 * Load a PHP-serialized data file, strip empty values from it with
 * array_clear() and return the result.
 *
 * @param string $file path of the serialized data file
 * @return array the loaded data, or an empty array when the file is missing,
 *               unreadable, or does not contain a non-empty serialized array
 */
function serialize_load($file)
{
    if (!is_file($file) || !is_readable($file)) {
        return [];
    }

    $raw = file_get_contents($file);
    if ($raw === false || $raw === '') {
        return [];
    }

    // NOTE(review): unserialize() can instantiate objects found in the file.
    // This is acceptable only while the cache file is written by this script
    // alone; consider restricting allowed classes or switching to JSON.
    $data = unserialize($raw);
    if (!is_array($data)) {
        // Corrupt or unexpected content: array_clear() needs an array.
        return [];
    }

    $data = array_clear($data);
    return empty($data) ? [] : $data;
}
/**
 * Strip empty values from a data array with array_clear() and write it to a
 * file in PHP serialized form.
 *
 * @param string $file path of the file to write
 * @param array  $data data to save
 * @return mixed the result of file_put_contents() when data was written, or
 *               the string 'No data to write to file.' when $data is empty
 */
function serialize_save($file, $data)
{
    if (!empty($data)) {
        $payload = serialize(array_clear($data));
        return file_put_contents($file, $payload);
    }

    return 'No data to write to file.';
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment