Skip to content

Instantly share code, notes, and snippets.

@maietta
Created July 17, 2017 18:33
Show Gist options
  • Save maietta/5a894b27804114085bafe0bed07cbf28 to your computer and use it in GitHub Desktop.
Save maietta/5a894b27804114085bafe0bed07cbf28 to your computer and use it in GitHub Desktop.
Build an inventory of PDF data in a SQL database, via PDO
<?php
/**
 * Builds an inventory of archived PDF files in the `pdf_inventory` table.
 *
 * For every year in $years the script:
 *   1. lists (for manual inspection) every PDF whose name contains "amp",
 *   2. walks every "*.pdf" file of the group/year directory,
 *   3. skips (and logs to TheHessFiles.txt) files whose path contains "&"
 *      or whose basename starts with "hess",
 *   4. inserts one row per file whose SHA-1 is not yet in the table, using
 *      metadata reported by the external `pdfinfo` command.
 */

/**
 * Turns the text printed by `pdfinfo` into an associative array.
 *
 * Every "Key: value" line becomes an entry whose key is the lower-cased key
 * with spaces replaced by underscores (e.g. "Page size" => "page_size") and
 * whose value is the trimmed text after the first colon. Three derived
 * entries are added when available:
 *   - "order":         the text after the first colon of any value that
 *                      contains the word "order" (case-insensitive), stored
 *                      as-is (not trimmed);
 *   - "created":       the "CreationDate" value as "Y-m-d H:i:s", or null
 *                      when the date cannot be parsed;
 *   - "last_modified": the "ModDate" value as "Y-m-d H:i:s", or null when
 *                      the date cannot be parsed.
 *
 * @param string|null|false $info Raw output of `pdfinfo`; anything that is
 *                                not a non-empty string yields an empty array.
 *
 * @return array Parsed metadata, keyed as described above.
 */
function parse_pdfinfo_output($info)
{
    $data = [];
    if (!is_string($info) || $info === '') {
        return $data;
    }

    $lines = preg_split('/\r\n|\r|\n/', $info, -1, PREG_SPLIT_NO_EMPTY);
    foreach ($lines as $line) {
        $keyValue = preg_split('/:/', $line, 2);
        if (count($keyValue) < 2) {
            // Not a "Key: value" line; nothing to record.
            continue;
        }

        $var = str_replace(' ', '_', strtolower($keyValue[0]));
        $value = $keyValue[1];

        if (stripos($value, 'ORDER') !== false) {
            // Values such as " Order: 12345" carry the order after a second colon.
            $orderParts = preg_split('/:/', $value, 2);
            if (isset($orderParts[1])) {
                $data['order'] = $orderParts[1];
            }
        }

        if (strpos($var, 'creationdate') !== false) {
            $timestamp = strtotime($value);
            // A failed parse must not be stored as 1970-01-01.
            $data['created'] = ($timestamp === false) ? null : date('Y-m-d H:i:s', $timestamp);
        }

        if (strpos($var, 'moddate') !== false) {
            $timestamp = strtotime($value);
            $data['last_modified'] = ($timestamp === false) ? null : date('Y-m-d H:i:s', $timestamp);
        }

        $data[$var] = trim($value);
    }

    return $data;
}

$group = "pcldtmdv";
//$years = range(2002, 2012);
$years = range(2013, 2018);
$archiveRoot = "/home/commnetivity/arb01/Archive/$group";
$skipLog = "TheHessFiles.txt";

try {
    // NOTE(review): database credentials are hard-coded in source; consider
    // loading them from the environment or a config file outside the repo.
    $db = new PDO('mysql:dbname=arb01_data;host=localhost', 'arb01_automation', '6@yyM^cl42!0s44ai5');
    $db->setAttribute(PDO::ATTR_ERRMODE, PDO::ERRMODE_EXCEPTION);

    // Diagnostic listing of files with "amp" in the name: first every
    // "*amp*.pdf" across all years, then every "*amp*.pdf.pdf".
    foreach (['*amp*.pdf', '*amp*.pdf.pdf'] as $pattern) {
        foreach ($years as $year) {
            // glob() returns false on error; treat that as "no matches".
            $matches = glob("$archiveRoot/$year/$pattern") ?: [];
            foreach ($matches as $file) {
                echo $file . "\n";
            }
        }
    }

    // Prepared once and re-executed for every file.
    $existsStmt = $db->prepare("SELECT 1 AS `row_exists` FROM pdf_inventory WHERE sha1_file = :sha1_file");
    $insertStmt = $db->prepare("INSERT INTO `pdf_inventory` (`sha1_file`, `group`, `year`, `order`, `filename`, `pages`, `created`, `last_modified`, `filesize`, `page_size`) VALUES (:sha1_file, :group, :year, :order, :filename, :pages, :created, :last_modified, :filesize, :page_size)");

    # sha1_file, group, year, filename, pages, info
    foreach ($years as $year) {
        $files = glob("$archiveRoot/$year/*.pdf") ?: [];
        foreach ($files as $file) {
            // Files with "&" in the path or a "hess" prefix are not
            // inventoried; their paths are appended to the skip log,
            // one per line.
            if (strpos($file, "&") !== false || substr(basename($file), 0, 4) === "hess") {
                file_put_contents($skipLog, $file . "\n", FILE_APPEND);
                continue;
            }

            echo $file . "\n";

            $sha1_file = sha1_file($file);
            if ($sha1_file === false) {
                // Unreadable file: without a hash there is no key to store it under.
                error_log("Unable to hash $file; skipping");
                continue;
            }
            $size_on_disk = filesize($file);

            $existsStmt->execute([':sha1_file' => $sha1_file]);
            $alreadyInventoried = ($existsStmt->fetch() !== false);
            $existsStmt->closeCursor();
            if ($alreadyInventoried) {
                continue;
            }

            /* Record not found. INSERT */
            // escapeshellarg() keeps spaces, quotes and shell metacharacters
            // in the path from being interpreted by the shell.
            $data = parse_pdfinfo_output(shell_exec('pdfinfo ' . escapeshellarg($file)));

            $insertStmt->execute([
                ':sha1_file'     => $sha1_file,
                ':group'         => $group,
                ':year'          => $year,
                ':order'         => isset($data['order']) ? $data['order'] : null,
                ':filename'      => basename($file),
                ':pages'         => isset($data['pages']) ? $data['pages'] : null,
                ':created'       => isset($data['created']) ? $data['created'] : null,
                ':last_modified' => isset($data['last_modified']) ? $data['last_modified'] : null,
                ':filesize'      => ($size_on_disk === false) ? null : $size_on_disk,
                ':page_size'     => isset($data['page_size']) ? $data['page_size'] : null,
            ]);
            echo $file . "\n";
        }
    }
} catch (PDOException $e) {
    // Raised for connection failures as well as failed queries.
    echo 'Connection failed: ' . $e->getMessage();
    exit(1);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment