Created
May 8, 2015 10:17
-
-
Save 256cats/c84b602d3ea8253d81aa to your computer and use it in GitHub Desktop.
Instagram (API) popular scraper and downloader with redis and curl, description here: http://256cats.com/how-to-scrape-instagram-and-quickly-download-images/
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
// Directory that receives the downloaded photos. It is created up front
// because file_put_contents() does not create missing parent directories.
$dir = __DIR__.'/photos';
if (!is_dir($dir) && !mkdir($dir, 0775, true) && !is_dir($dir)) {
    // The second is_dir() covers the case where another process created the
    // directory between the first check and the mkdir() call.
    echo "cannot create photo directory {$dir}\n";
    exit(1);
}

// Connection to the local Redis server that holds the 'photo:queue' list.
$redis = new Redis();
if (!$redis->connect('127.0.0.1', 6379)) {
    echo "cannot connect to Redis at 127.0.0.1:6379\n";
    exit(1);
}
/**
 * Download a URL with cURL and return the response body.
 *
 * The URL is echoed before the request is sent. Redirects are followed
 * (at most 9), and both the connect phase and the whole transfer are limited
 * to 10 seconds. TLS certificates are verified, so a tampered or
 * misconfigured endpoint is reported as a failure instead of being trusted.
 *
 * NOTE: CURLOPT_FAILONERROR is not set, so the body of an HTTP error
 * response (4xx/5xx) is still returned to the caller as data.
 *
 * @param string $url Address to fetch.
 * @return string|false Response body, or false when the transfer failed.
 */
function get($url) {
    echo $url."\n";
    $curlOptions = array(
        CURLOPT_ENCODING => 'gzip,deflate',
        CURLOPT_AUTOREFERER => 1,
        CURLOPT_CONNECTTIMEOUT => 10, // timeout on connect
        CURLOPT_TIMEOUT => 10, // timeout on response
        CURLOPT_URL => $url,
        // Verify the peer certificate and that it matches the host name.
        CURLOPT_SSL_VERIFYPEER => true,
        CURLOPT_SSL_VERIFYHOST => 2,
        CURLOPT_FOLLOWLOCATION => true,
        CURLOPT_MAXREDIRS => 9,
        CURLOPT_RETURNTRANSFER => 1,
        CURLOPT_HEADER => 0,
        CURLOPT_USERAGENT => "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36",
        CURLOPT_VERBOSE => true,
        CURLINFO_HEADER_OUT => true,
    );
    $curl = curl_init();
    curl_setopt_array($curl, $curlOptions);
    $data = curl_exec($curl);
    if ($data === false) {
        // Surface the reason so a retrying caller leaves a useful trace.
        echo 'curl error: '.curl_error($curl)."\n";
    }
    curl_close($curl);
    return $data;
}
// Worker loop: block on the Redis list 'photo:queue', download the
// standard-resolution image of every queued item and store it under $dir.
// Files that already exist on disk are not downloaded again.

// Upper bound on download attempts per item, so that one permanently
// unreachable URL cannot stall the worker forever.
$maxAttempts = 5;

while (true) {
    $item = $redis->brPop('photo:queue', 10); // wait until we get new item from Redis
    if (!$item) {
        echo "no items in Redis\n";
        continue;
    }

    // NOTE(review): unserialize() must only see trusted data. Anyone able to
    // write to this Redis list can inject arbitrary objects here; consider
    // switching producer and worker to JSON.
    $item = unserialize($item[1]);
    if (!is_array($item)
        || !isset($item['filename'], $item['images']->standard_resolution->url)) {
        echo "skipping malformed queue item\n";
        continue;
    }

    // basename() keeps a crafted filename from escaping the photo directory.
    $filename = $dir.'/'.basename($item['filename']);
    if (file_exists($filename)) {
        continue;
    }

    $url = $item['images']->standard_resolution->url;
    $photo = get($url);
    for ($retry = 1; !$photo && $retry < $maxAttempts; $retry++) {
        echo "retrying download {$retry}\n";
        sleep(2);
        $photo = get($url);
    }
    if (!$photo) {
        echo "giving up on {$url} after {$maxAttempts} attempts\n";
        continue;
    }

    // Only report success when the bytes actually reached the disk.
    if (file_put_contents($filename, $photo) === false) {
        echo "could not write {$filename}\n";
        continue;
    }
    echo "Loaded {$filename}\n";
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
require_once 'vendor/autoload.php'; | |
use MetzWeb\Instagram\Instagram; | |
// Producer script: fetch the currently popular Instagram media through the
// API client and push every image entry onto the Redis list 'photo:queue'
// as a serialized array.
date_default_timezone_set('UTC');

$redis = new Redis();
$redis->connect('127.0.0.1', 6379);

$instagram = new Instagram(array(
    'apiKey' => 'YOUR_APP_KEY',
    'apiSecret' => 'YOUR_APP_SECRET',
));
$accessToken = 'YOUR_ACCESS_TOKEN';
$instagram->setAccessToken($accessToken);

$search = $instagram->getPopularMedia();
// A response without a 'data' member (for example an API error) is treated
// as an empty result set instead of raising notices in the loop below.
$data = isset($search->data) ? $search->data : array();

foreach ($data as $d) {
    // Only still images are queued; other media types are ignored.
    if ($d->type !== 'image') {
        continue;
    }
    $item = array(
        'images' => $d->images,
        'caption' => $d->caption,
        'created_time' => $d->created_time,
        'id' => $d->id,
        // The file name is derived from the media id. The original code used
        // an undefined $id here, which named every file '.jpg'.
        'filename' => $d->id.'.jpg',
    );
    $redis->lPush('photo:queue', serialize($item));
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment