Created
September 10, 2017 22:30
-
-
Save tommelo/135b150cefded0e5dd7ad1478ec83c3b to your computer and use it in GitHub Desktop.
Simple Scraper
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
'use strict'; | |
var request = require('request-promise'); | |
var cheerio = require('cheerio'); | |
/**
 * Base scraper.
 *
 * Concrete scrapers inherit from this constructor and override
 * scrape() and transform(); the base versions of those two methods
 * only throw. The remaining methods wire a request, the cheerio
 * loader and the two overridable steps together.
 */
function Scraper() {
}

/**
 * Returns a http get request promise
 *
 * Hands the request definition to the request-promise module
 * unchanged.
 *
 * @param  {Object}  req     The request definition
 * @return {Promise} promise The get request promise
 */
Scraper.prototype.get = function(req) {
    return request(req);
};

/**
 * Returns a cheerio instance loaded with the given html body
 *
 * @param  {String}   body The html body
 * @return {Function} $    The cheerio instance
 */
Scraper.prototype.load = function(body) {
    return cheerio.load(body);
};

/**
 * Returns a promise of a request call
 *
 * Builds the request definition, issues it through get() and chains
 * scrape() onto the response. The value passed to scrape() is whatever
 * the request's `transform` hook (load()) produced.
 *
 * @param  {String}  uri     The request uri
 * @param  {Object}  qs      The request query string
 * @return {Promise} promise The request promise, resolved with the
 *                           return value of scrape()
 */
Scraper.prototype.prepare = function(uri, qs) {
    var req = {
        uri: uri,
        qs: qs,
        // Per the request-promise docs, `simple: false` stops non-2xx
        // status codes from rejecting the promise.
        simple: false,
        // Bound so that an overridden load() can rely on `this` being
        // the scraper instance instead of the request options object.
        transform: this.load.bind(this),
        // NOTE(review): the request library documents `followRedirect`
        // (singular); `followRedirects` does not appear to be a
        // recognised option, so redirects are presumably still followed.
        // The key is left untouched to preserve current behaviour --
        // confirm the intent before renaming it.
        followRedirects: false
    };

    // Bound for the same reason as load(): passing `this.scrape`
    // directly would invoke it with `this === undefined`, breaking any
    // scrape() implementation that reads instance state.
    return this.get(req).then(this.scrape.bind(this));
};

/**
 * Executes the scrape chain:
 * request -> cheerio load -> scrape -> transform
 *
 * Waits for every entry of `promises` (typically the promises returned
 * by prepare()) and passes the array of their results to transform().
 *
 * @param  {Array}   promises The promises array
 * @return {Promise} promise  The execution result, i.e. the return
 *                            value of transform()
 */
Scraper.prototype.execute = function(promises) {
    var self = this;

    return Promise.all(promises).then(function(result) {
        return self.transform(result);
    });
};

/**
 * The scrape method.
 * This is the abstract method that should be implemented by all the scrapers.
 * It performs a html body scrape.
 *
 * @param  {Function} $      The cheerio instance
 * @return {Array}    result The scrape result
 * @throws {TypeError} Always, unless overridden
 */
Scraper.prototype.scrape = function($) {
    throw new TypeError('scrape() method must be implemented');
};

/**
 * The transform method.
 * This is the abstract method that should be implemented by all the scrapers.
 * It transforms the scrape results into an array of objects.
 *
 * @param  {Array} result     The scrape results, one entry per promise
 *                            given to execute()
 * @return {Array} normalized An array of normalized data
 * @throws {TypeError} Always, unless overridden
 */
Scraper.prototype.transform = function(result) {
    throw new TypeError('transform() method must be implemented');
};
module.exports = Scraper; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment