Skip to content

Instantly share code, notes, and snippets.

@zeen
Created June 20, 2010 23:54
Show Gist options
  • Save zeen/446234 to your computer and use it in GitHub Desktop.
Save zeen/446234 to your computer and use it in GitHub Desktop.
<?php
/**
* A command line interface for automatic downloads of image feeds
*
* @package easypopulate
* @author Waqas Hussain
* @copyright 2010
* @license http://www.gnu.org/licenses/gpl-2.0.html GNU General Public License (v2 only)
*/
// Feed description: a JSON list of provider objects, formatted as [{...}, {...}, ...]
$json_file = "sample.json";
// Folder in which the extracted output goes
$image_root = "./out";
// Folder in which the downloaded zips are cached
$zip_root = "./zips";
// Temporary file holding the zip that is currently being downloaded
$tmpfile = $zip_root . "/tmp.zip";
/**
 * Prints a debug-level message on its own line, prefixed with "debug: ".
 *
 * @param string $message text to print
 */
function debug($message) {
    echo "debug: " . $message . "\n";
}
/**
 * Prints a warning message on its own line, prefixed with "warn: ".
 *
 * @param string $message text to print
 */
function warn($message) {
    echo "warn: " . $message . "\n";
}
/**
 * Prints an error message on its own line, prefixed with "error: ".
 * The script keeps running afterwards.
 *
 * @param string $message text to print
 */
function error($message) {
    echo "error: " . $message . "\n";
}
/**
 * Reports an unrecoverable problem and terminates the script.
 *
 * The message is printed followed by a newline and the process exits with
 * status 1, so that a calling shell or scheduler can detect the failure
 * (die() with a string argument prints it but exits with status 0).
 *
 * @param string $message description of the failure
 */
function fatal($message) {
    echo "$message\n";
    exit(1);
}
/**
 * Reads and decodes the JSON feed description.
 *
 * Terminates the script through fatal() when the file cannot be read or does
 * not contain valid JSON. A valid document that decodes to a falsy value,
 * such as the empty list "[]", is returned as-is instead of being reported
 * as a parse failure.
 *
 * @param string $file path of the JSON file
 * @return mixed the decoded document (JSON objects become stdClass instances)
 */
function load_json($file) {
    $raw = file_get_contents($file);
    if ($raw === FALSE) {
        fatal("failed to load json file");
    }
    $json = json_decode($raw);
    // json_decode() returns NULL both for invalid input and for the literal
    // "null"; only the recorded error state identifies a real parse failure.
    if ($json === NULL && json_last_error() !== JSON_ERROR_NONE) {
        fatal("failed to parse json file");
    }
    return $json;
}
/**
 * Asks the server for the modification time of a URL without downloading the body.
 *
 * @param string $url the remote resource to query
 * @return int|bool the value cURL reports for CURLINFO_FILETIME after the
 *                  request, or FALSE when cURL could not be initialised or
 *                  the request ended with a cURL error
 */
function get_remote_file_time($url) {
    $handle = curl_init($url);
    if (!$handle) {
        error("failed to intialize curl");
        return FALSE;
    }
    curl_setopt_array($handle, array(
        CURLOPT_NOBODY => 1,         // HEAD request
        CURLOPT_RETURNTRANSFER => 1, // don't print to stdout
        CURLOPT_FAILONERROR => 1,
        CURLOPT_FILETIME => 1,
    ));
    curl_exec($handle);
    $remote_time = curl_errno($handle)
        ? FALSE
        : curl_getinfo($handle, CURLINFO_FILETIME);
    curl_close($handle);
    return $remote_time;
}
/**
 * Selects the zip entries worth extracting and maps them to target paths.
 *
 * Only entries whose name starts with $filter_prefix are considered; the
 * prefix is stripped and the remainder is appended to $destination. Entries
 * named "<set>_<number>.jpg" are treated as variants of one image set, and
 * only the variant with the largest number is kept (the last one wins on a
 * tie). Entries whose remaining path contains a ".." segment are skipped so
 * that an archive cannot place files outside $destination.
 *
 * @param object $zip archive exposing numFiles and getNameIndex(), e.g. a ZipArchive
 * @param string $destination directory the entries will be written to
 * @param string|null $filter_prefix required entry-name prefix; NULL or "" accepts every entry
 * @return array map of entry name inside the archive => path on disk
 */
function get_useful_zip_entries($zip, $destination, $filter_prefix) {
    // "$x = $x or ''" never applied the default: "=" binds tighter than "or".
    if ($filter_prefix === NULL || $filter_prefix === FALSE) {
        $filter_prefix = "";
    }
    $filter_prefix = (string) $filter_prefix;
    $prefix_length = strlen($filter_prefix);

    $files = array();
    $sets = array(); // set name => array(largest size seen, entry name)
    for ($i = 0; $i < $zip->numFiles; $i++) {
        $entry = $zip->getNameIndex($i);
        // strncmp() is a real prefix test; "strpos(...) == 0" was also true
        // when the prefix did not occur at all (FALSE == 0).
        if ($entry === FALSE || strncmp($entry, $filter_prefix, $prefix_length) !== 0) {
            continue;
        }
        $relative = (string) substr($entry, $prefix_length);
        // Refuse entries that would climb out of the destination directory.
        if (in_array("..", preg_split('#[/\\\\]#', $relative), TRUE)) {
            warn("skipping unsafe zip entry: $entry");
            continue;
        }
        if (preg_match('/^(.*)_(\d+)\.jpg$/', $entry, $matches) > 0) {
            $setname = $matches[1];
            $size = intval($matches[2]);
            if (!isset($sets[$setname]) || $sets[$setname][0] <= $size) {
                $sets[$setname] = array($size, $entry);
            }
        } else {
            $files[$entry] = $destination . "/" . $relative;
        }
    }
    // Add the winning variant of every image set.
    foreach ($sets as $best) {
        $entry = $best[1];
        $files[$entry] = $destination . "/" . substr($entry, $prefix_length);
    }
    return $files;
}
/**
 * Decides whether the local copy is at least as recent as the remote file.
 *
 * @param string $url remote location of the file
 * @param string $filename path of the local copy
 * @return bool TRUE only when the local file exists and the server reported a
 *              modification time that is not newer than the local one;
 *              FALSE in every other case (no local file, failed request, or
 *              the remote time came back as -1)
 */
function is_file_younger_than_url($url, $filename) {
    $last_modified = @filemtime($filename);
    if ($last_modified === FALSE) {
        // No local file: nothing to compare against.
        return FALSE;
    }

    $remote_time = get_remote_file_time($url);
    if ($remote_time == -1) {
        debug("Server didn't provide last-modified time");
        return FALSE;
    }
    if ($remote_time !== FALSE && $remote_time <= $last_modified) {
        debug("No new file on server, skipping");
        return TRUE; // skip extraction
    }
    return FALSE;
}
/**
 * Downloads a URL into a local file.
 *
 * After a successful transfer the local file's modification time is set to
 * the time reported by cURL for the remote file, unless that time is -1.
 *
 * @param string $url remote location to fetch
 * @param string $filename local path that receives the data
 * @return bool the result of curl_exec(), or FALSE when cURL could not be
 *              initialised or the local file could not be opened
 */
function download_file($url, $filename) {
    debug("Downloading $filename ($url)");

    $handle = curl_init($url);
    if (!$handle) {
        error("failed to intialize curl");
        return FALSE;
    }

    $stream = fopen($filename, "w");
    if (!$stream) {
        error("failed to open $filename for writing");
        curl_close($handle);
        return FALSE;
    }

    curl_setopt_array($handle, array(
        CURLOPT_FILE => $stream,
        CURLOPT_HEADER => 0,
        CURLOPT_FAILONERROR => 1,
        CURLOPT_FILETIME => 1,
    ));
    $ok = curl_exec($handle);
    fclose($stream);

    if (!$ok) {
        error("download failed");
    } else {
        $remote_time = curl_getinfo($handle, CURLINFO_FILETIME);
        if ($remote_time !== -1) {
            touch($filename, $remote_time);
        }
    }

    curl_close($handle);
    return $ok;
}
/**
 * Extracts the useful entries of a zip archive below a destination folder.
 *
 * The entries and their target paths are chosen by get_useful_zip_entries().
 * Missing target directories are created. Problems with individual entries
 * are reported through error() and do not stop the remaining entries.
 *
 * @param string $filename path of the zip archive
 * @param string $destination folder that receives the extracted files
 * @param string $filter_prefix entry-name prefix passed on to get_useful_zip_entries()
 * @return bool TRUE when the archive could be opened, FALSE otherwise
 */
function extract_file($filename, $destination, $filter_prefix) {
    debug("Extracting $filename to $destination");
    $zip = new ZipArchive;
    if ($zip->open($filename) !== TRUE) {
        error("failed to open zip for reading");
        return FALSE;
    }

    $files = get_useful_zip_entries($zip, $destination, $filter_prefix);
    foreach ($files as $inner_path => $outer_path) {
        // Directory entries carry no data; directories are created on demand below.
        if (substr($inner_path, -1) === "/") {
            continue;
        }
        $content = $zip->getFromName($inner_path);
        // Compare with FALSE explicitly: an empty file or one containing
        // just "0" is valid content and must still be written.
        if ($content === FALSE) {
            error("failed to read $inner_path from zip");
            continue;
        }
        $directory = pathinfo($outer_path, PATHINFO_DIRNAME);
        if (!is_dir($directory) && !mkdir($directory, 0777, TRUE)) {
            error("failed to create directory $directory");
            continue;
        }
        if (file_put_contents($outer_path, $content) === FALSE) {
            error("failed to write $outer_path");
        }
    }
    $zip->close();
    return TRUE;
}
/**
 * Removes the files that extract_file() would have written for an archive.
 *
 * The same entry selection (get_useful_zip_entries()) is used, and every
 * target path that currently exists as a regular file is deleted.
 * Directories are left in place.
 *
 * @param string $filename path of the zip archive
 * @param string $destination folder the archive was extracted into
 * @param string $filter_prefix entry-name prefix passed on to get_useful_zip_entries()
 * @return bool TRUE when the archive could be opened, FALSE otherwise
 */
function unextract_file($filename, $destination, $filter_prefix) {
    debug("Unextracting $filename to $destination");
    $zip = new ZipArchive;
    if ($zip->open($filename) !== TRUE) {
        error("failed to open zip for reading");
        return FALSE;
    }

    $files = get_useful_zip_entries($zip, $destination, $filter_prefix);
    foreach ($files as $outer_path) {
        // is_file() rather than file_exists(): targets of directory entries
        // exist too, but unlink() cannot remove a directory.
        if (is_file($outer_path) && !unlink($outer_path)) {
            error("failed to remove $outer_path");
        }
    }
    $zip->close();
    return TRUE;
}
/**
 * Brings the extracted images of one provider up to date.
 *
 * When the cached zip is missing or older than the remote file, the remote
 * zip is downloaded to the temporary file, the files of the previously
 * cached zip are removed, the new zip replaces the cached one and is then
 * extracted. Providers without an images_url are skipped.
 *
 * Reads the globals $image_root, $zip_root and $tmpfile.
 *
 * @param object $provider decoded JSON object; reads the properties name,
 *                         images_url, images_file_path (prefix of the wanted
 *                         entries inside the zip) and image_path_prefix
 *                         (sub-folder of $image_root); absent optional
 *                         properties are treated as ""
 * @return void
 */
function download($provider) {
    global $image_root, $zip_root, $tmpfile;

    $name = isset($provider->name) ? $provider->name : "";
    if (empty($provider->images_url)) {
        debug("skipping: $name, no images_url");
        return;
    }
    $images_url = $provider->images_url;
    // Optional properties: default to "" instead of reading undefined properties.
    $images_file_path = isset($provider->images_file_path) ? $provider->images_file_path : "";
    $image_path_prefix = isset($provider->image_path_prefix) ? $provider->image_path_prefix : "";

    // FIXME escape/validate paths/filenames?
    $zip_file = "$zip_root/$name.zip";
    $image_folder = "$image_root/$image_path_prefix";

    debug("processing: $name");
    if (is_file_younger_than_url($images_url, $zip_file)) {
        return;
    }

    // The cache folder has to exist before the temporary file can be opened.
    if (!is_dir($zip_root) && !mkdir($zip_root, 0777, TRUE)) {
        error("failed to create directory $zip_root");
        return;
    }
    if (!download_file($images_url, $tmpfile)) {
        return;
    }
    debug("downloaded file: $zip_file");

    if (file_exists($zip_file)) {
        unextract_file($zip_file, $image_folder, $images_file_path);
    }
    // todo: last modified
    if (!rename($tmpfile, $zip_file)) {
        // Keep going: extracting whatever zip is still cached restores the
        // files that were removed just above.
        error("failed to move $tmpfile to $zip_file");
    }
    if (extract_file($zip_file, $image_folder, $images_file_path)) {
        debug("extracted file: $zip_file");
    }
}
// Entry point: load the provider list and process each provider in turn.
$json = load_json($json_file);
foreach ($json as $provider) {
    download($provider);
    debug(""); // empty debug line separates the output of consecutive providers
}
?>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment