Created
February 29, 2016 06:47
-
-
Save dezull/e039bbfbc8157e27e016 to your computer and use it in GitHub Desktop.
Scrape & archive webpage
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Module dependencies.
var fs = require('fs-extra');
var _ = require('lodash');
// NOTE: this shadows the built-in Promise for the whole module, so every
// `Promise` below refers to bluebird.
var Promise = require('bluebird');
var archiver = require('archiver');
var scraper = require('website-scraper');
// Fallback option values. The only default is a User-Agent header value that
// presents the client as mobile Chrome on an Android Nexus 4.
var defaultOptions = {
  'User-Agent': 'Mozilla/5.0 (Linux; Android 4.2.1; en-us; Nexus 4 Build/JOP40D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166 Mobile Safari/535.19'
};
/**
 * Zip a directory into `<directory>.zip`, then delete the directory.
 *
 * The directory is removed whether or not zipping succeeds. If zipping fails
 * the returned promise rejects with the zipping error; if removing the
 * directory fails it rejects with the removal error.
 *
 * @param {Object} options
 * @param {string} options.directory Directory to archive; the archive is
 *   written to `options.directory + '.zip'`
 * @return {Promise} Resolves with the path of the zip file
 */
function zip(options) {
  var zipPath = options.directory + '.zip';

  // Write the whole directory into the zip file; settles once the output
  // stream has finished or either stream reports an error.
  var archive = function() {
    return new Promise(function(resolve, reject) {
      var zipFile = fs.createWriteStream(zipPath);
      var zipArchive = archiver.create('zip', {});

      zipFile
        .on('finish', resolve)
        .on('error', reject);
      zipArchive.on('error', reject);

      // Attach the destination before queueing entries and finalizing. Piping
      // the return value of finalize() is not reliable: depending on the
      // archiver version it is a promise rather than the archive stream.
      zipArchive.pipe(zipFile);

      // It's very important we set the same date for each file, to ensure
      // that zip files containing the same files have the same checksum.
      zipArchive.directory(options.directory, '/', { date: new Date(0) });

      // When finalize() does return a promise, route its rejection into ours
      // so a failure never surfaces as an unhandled rejection.
      var finalizing = zipArchive.finalize();
      if (finalizing && typeof finalizing.catch === 'function') {
        finalizing.catch(reject);
      }
    });
  };

  // Promise wrapper around the callback-style fs.remove.
  var removeDirectory = function() {
    return new Promise(function(resolve, reject) {
      fs.remove(options.directory, function(e) {
        if (!e) return resolve();
        reject(e);
      });
    });
  };

  // `finally` removes the directory on both success and failure and keeps the
  // zipping error as the rejection reason unless the removal itself fails.
  return archive()
    .finally(removeDirectory)
    .then(function() { return zipPath; });
}
/**
 * Download a single page into a directory using website-scraper.
 *
 * @param {Object} options
 * @param {string} options.url URL of the page to scrape
 * @param {string} options.directory Directory the page is saved into
 * @param {string} options['User-Agent'] Value sent as the User-Agent header
 * @return {*} Whatever `scraper.scrape` returns (callers in this file treat
 *   it as a promise)
 */
function scrape(options) {
  return scraper.scrape({
    urls: [ options.url ],
    directory: options.directory,
    request: {
      headers: {
        'User-Agent': options['User-Agent']
      }
    }
  });
}
/** | |
* Create a scraper. | |
* | |
* factoryOptions: | |
* { | |
* directory: baseDirectoryForWebpage (required) | |
* } | |
* | |
* @param {Object} factoryOptions See above | |
* @return {Object} scraper object | |
*/ | |
exports.create = function(factoryOptions) { | |
return { | |
scrape: function(options) { | |
_.defaults(options, factoryOptions, defaultOptions); | |
options.directory = options.directory + '/' + options.id; | |
return scrape(options) | |
.then(function() { return zip(options); }) | |
; | |
} | |
}; | |
}; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment