-
-
Save cizario/5736144 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
// See https://lazycat.org/php-curl.html for license & known issues
/**
 * Fetch a URL with cURL, caching the response body on disk.
 *
 * The body is stored in a "cache" folder next to this file, in a file named
 * after the CRC32 of the URL. A cached copy younger than $ttl seconds is
 * returned without any network traffic. Otherwise a conditional GET
 * (If-Modified-Since) is made when a cached copy exists, and the cache is
 * reused if the server answers 304.
 *
 * @param string $url URL to fetch.
 * @param int    $ttl Cache time-to-live in seconds (default one day).
 *                    Pass 0 to skip the freshness check and always hit
 *                    the network.
 *
 * @return string|false The response body. The cache-hit and 304 paths
 *                      return whatever file_get_contents() yields, which
 *                      is false if the cache file cannot be read.
 *
 * @throws Exception If the transfer fails or the HTTP status is neither
 *                   200 nor (with an existing cache) 304.
 */
function httpGet($url, $ttl = 86400)
{
    /* Change this or make it an option as appropriate. If you're
     * getting urls that shouldn't be visible to the public, put the
     * cache folder somewhere it can't be accessed from the web
     */
    $cache_path = dirname(__FILE__).'/cache';

    /* crc32() is used to turn the URL into a safe file name. If you
     * fetch enough URLs to get collisions, switch to md5() or similar
     * and change the sprintf() pattern.
     */
    $cache_file = sprintf('%s/%08X.dat', $cache_path, crc32($url));
    $cache_exists = is_readable($cache_file);

    /* Remember when the cached copy was written BEFORE the file is
     * touched below. This timestamp is what the conditional request must
     * send; reading it after touch() would always yield "now" and the
     * server would answer 304 even for documents that changed since the
     * cache was written.
     */
    $cache_mtime = $cache_exists ? filemtime($cache_file) : false;

    /* If the cache is newer than the Time To Live, return it
     * instead of doing a new request.
     */
    if ($ttl && $cache_exists && ($cache_mtime > (time() - $ttl))) {
        return file_get_contents($cache_file);
    }

    /* Need to regenerate the cache. Update the modification time first
     * so that other requests see a fresh cache and don't all try to
     * refresh it while we're doing so.
     */
    touch($cache_file);
    clearstatcache();

    /* Set up the cURL handle. It's important to set a User-Agent
     * that's unique to you, and provides contact details in case your
     * script is misbehaving and a server owner needs to contact you.
     */
    $c = curl_init();
    curl_setopt($c, CURLOPT_URL, $url);
    curl_setopt($c, CURLOPT_TIMEOUT, 15);
    curl_setopt($c, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($c, CURLOPT_USERAGENT,
        'ExampleFetcher/0.9 (http://example.com/; [email protected])');

    /* If we've got a cache, make a conditional HTTP request using the
     * time the cached copy was written, so the server can tell us
     * nothing has changed instead of resending the body.
     */
    if ($cache_exists && $cache_mtime !== false) {
        curl_setopt($c, CURLOPT_TIMECONDITION, CURL_TIMECOND_IFMODSINCE);
        curl_setopt($c, CURLOPT_TIMEVALUE, $cache_mtime);
    }

    /* Make the request and check the result. */
    $content = curl_exec($c);
    $status = curl_getinfo($c, CURLINFO_HTTP_CODE);

    // Document unmodified? Return the cache file (its mtime was already
    // refreshed by the touch() above, so it counts as fresh again).
    if ($cache_exists && ($status == 304)) {
        return file_get_contents($cache_file);
    }

    /* Fail hard on errors so that it's obvious when something goes
     * wrong - but first remove the empty placeholder that touch()
     * created when there was no previous cache. Left in place, it would
     * be served as an empty "cached" response until the TTL expired.
     */
    if ($content === false || $status != 200) {
        if (!$cache_exists) {
            clearstatcache();
            if (is_file($cache_file) && filesize($cache_file) === 0) {
                unlink($cache_file);
            }
        }
        if ($content === false) {
            throw new Exception(sprintf('cURL request failed: %s', curl_error($c)));
        }
        throw new Exception(sprintf('Unexpected HTTP return code %d', $status));
    }

    /* If everything is fine, save the new cache file and make sure
     * it's world-readable, and writeable by the server. A failed cache
     * write is not fatal: the freshly fetched content is still returned.
     */
    if (file_put_contents($cache_file, $content) !== false) {
        chmod($cache_file, 0644);
    }

    return $content;
}
?> |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment