Skip to content

Instantly share code, notes, and snippets.

@Fabryz
Created April 25, 2012 21:35
Show Gist options
  • Save Fabryz/2493656 to your computer and use it in GitHub Desktop.
Save Fabryz/2493656 to your computer and use it in GitHub Desktop.
Save all images contained in a website domain/paths
{
"name": "rakeup",
"version": "0.0.1",
"author": "Fabrizio Codello",
"engines": {
"node": "0.6.x"
},
"private": true,
"dependencies": {
"http-agent": "0.1.x",
"jsdom": "0.2.x"
}
}
var httpAgent = require('http-agent'),
jsdom = require('jsdom'),
http = require('http'),
url = require('url'),
fs = require('fs'),
exec = require('child_process').exec,
child;
/**
 * Synchronously test whether a path refers to an existing directory.
 *
 * @param {string} dir - path to inspect.
 * @returns {boolean} true when the path can be stat'ed and is a directory;
 *     false when it is something else or when the stat call throws.
 */
function dirExistsSync(dir) {
  var stats;
  try {
    stats = fs.statSync(dir);
  } catch (statError) {
    // Any stat failure (missing path, no access, ...) is reported as "not a directory".
    return false;
  }
  return stats.isDirectory();
}
/**
 * Download one resource over HTTP and store it in the download directory.
 *
 * The response body is streamed straight to disk instead of being buffered
 * in memory. Failures are logged rather than thrown: they happen inside
 * asynchronous callbacks where no caller could catch them, and one bad
 * image should not abort the whole crawl.
 *
 * @param {Object} options - request options passed to http.get (host, port, path).
 * @param {string} name - file name to create under savePath.
 */
function saveImage(options, name) {
  http.get(options, function (res) {
    // Anything other than 200 is an error page or a redirect, not the image itself.
    if (res.statusCode !== 200) {
      console.log("Got error: HTTP " + res.statusCode + " for " + name);
      // Drain the body so the socket is released.
      res.resume();
      return;
    }

    var file = fs.createWriteStream(savePath + name);
    file.on('error', function (err) {
      console.log("Got error: " + err.message);
    });
    res.pipe(file);
  }).on('error', function (e) {
    console.log("Got error: " + e.message);
  });
}
// Crawl configuration.
// Host and path list are handed to the http-agent factory below.
var domain = 'boards.4chan.org';
var domainPaths = ['v', 'sci'];

// Prefix prepended to every file name when an image is written.
var savePath = './images/';

// Incremented once for every image download that is started.
var count = 0;

var agent = httpAgent.create(domain, domainPaths);
// For every page the agent delivers: load it into jsdom with jQuery injected,
// start a download for each usable <img>, then advance to the next page.
agent.addListener('next', function (err, currentAgent) {
  // A failed page fetch has no usable body; report it and advance.
  if (err) {
    console.log("Got error: " + (err.message || err));
    agent.next();
    return;
  }

  jsdom.env({
    html: currentAgent.body,
    scripts: [
      'http://code.jquery.com/jquery-1.7.2.min.js'
    ]
  }, function (err, window) {
    // jQuery is fetched remotely, so it may be missing even when a window exists.
    var $ = window && window.jQuery;
    if (!$) {
      console.log("Got error: could not load the page into jsdom");
      currentAgent.next();
      return;
    }

    // Base for resolving sources that are not absolute, e.g. "//host/path".
    var pageBase = 'http://' + domain + '/';

    // Start a download for every image that resolves to a plain http URL.
    $('img').each(function () {
      var imgSrc = $(this).attr('src');
      if (!imgSrc) {
        return; // <img> without a src: nothing to download
      }

      var fullURL = url.parse(url.resolve(pageBase, imgSrc));
      if (fullURL.protocol !== 'http:') {
        return; // saveImage only speaks http; skip https:, data:, ...
      }

      // Use the pathname so a query string does not end up in the file name.
      var segments = (fullURL.pathname || '').split('/');
      var fileName = segments[segments.length - 1];
      if (!fileName) {
        return; // path ends in "/": there is no file name to save under
      }

      var options = {
        host: fullURL.hostname,
        port: fullURL.port || 80,
        path: fullURL.path
      };

      saveImage(options, fileName);
      count++;
    });

    currentAgent.next();
  });
});
// Print the final summary when the agent emits 'stop'.
agent.addListener('stop', function onStop() {
  var summary = ['* ', count, ' images saved.'].join('');
  console.log(summary);
  console.log('* Done.');
});
// Print the crawl configuration and start the agent.
// Called only after the download directory has been created or emptied, so
// downloads can neither hit a missing directory nor be deleted by the cleanup.
function startCrawl() {
  console.log('* Grabbing all images from '+ domain +'.');
  console.log('* Paths: /'+ domainPaths.join(' /') +'.');
  console.log('* Download dir: '+ savePath);
  agent.start();
}

if (!dirExistsSync(savePath)) {
  // Create savePath if it does not exist yet.
  fs.mkdir(savePath, function (err) {
    if (err) throw err;
    startCrawl();
  });
} else {
  // Directory already exists: remove its contents before crawling.
  child = exec('rm -rf '+ savePath +'*', function (err) {
    // Cleanup is best-effort: report a failure but still run the crawl.
    if (err) {
      console.log("Got error: " + err.message);
    }
    startCrawl();
  });
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment