Created
June 1, 2015 21:55
-
-
Save yurivictor/4984ef873efb5146de08 to your computer and use it in GitHub Desktop.
Example scraper in Node.js (scraping Rand Paul's site for issues)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
var cheerio = require('cheerio'); | |
var request = require('request'); | |
var fs = require('fs'); | |
// Module-wide alias for Scrape.settings; assigned by Scrape.init()
var s,

    /**
     * Scraper for the issue list on the configured site.
     *
     * Flow: init() requests the index page -> scrapePage() records one entry
     * per issue link and requests each linked page -> getText() stores the
     * article text on the matching entry -> outputJSON() writes the result.
     */
    Scrape = {

        settings: {
            // Object, the json to be output, keyed by each link's position on the index page
            json: {},
            // String, the url to scrape
            domain: 'https://randpaul.com/issues',
            // Int, counts the issue-page responses handled so far
            iterator: 0
        },

        /**
         * Starts the scrape and schedules the JSON dump.
         */
        init: function() {
            // Globalize settings
            s = this.settings;
            // Start scraper
            request( s.domain, this.scrapePage );
            // Wait 10 seconds to scrape and then output the JSON; any issue page
            // that has not answered by then is written with an empty 'text'
            setTimeout( Scrape.outputJSON, 10000 );
        },

        /**
         * request() callback for the index page: records every issue link and
         * requests the page it points to.
         *
         * @param {?Error} error - Error reported by request(), if any.
         * @param {Object} resp  - Response object from request() (unused).
         * @param {string} html  - Body of the index page.
         */
        scrapePage: function( error, resp, html ) {
            // Without a body there is nothing to parse, so report and stop
            if ( error || !html ) {
                console.error( 'Could not fetch ' + s.domain + ': ' + ( error ? error.message : 'empty response' ) );
                return;
            }
            // Init cheerio
            var $ = cheerio.load( html );
            // Get each issue
            $( '.short-article a' ).each( function ( i, elem ) {
                var link = $( this );
                // Up the json with the issue name; the text is filled in later
                s.json[i] = { 'issue': link.text(), 'text': '' };
                // Get the text. The link's own index is handed to getText because
                // the issue pages are requested in parallel and may answer in any
                // order, so arrival order cannot be used to find the entry
                request( link.attr( 'href' ), function ( err, res, body ) {
                    Scrape.getText( err, res, body, i );
                } );
            } );
        },

        /**
         * request() callback for a single issue page: stores the article text.
         *
         * @param {?Error} error   - Error reported by request(), if any.
         * @param {Object} resp    - Response object from request() (unused).
         * @param {string} html    - Body of the issue page.
         * @param {number} [index] - Key of the s.json entry this page belongs to.
         *                           When omitted, the arrival-order counter
         *                           s.iterator is used instead.
         */
        getText: function( error, resp, html, index ) {
            var slot = typeof index === 'number' ? index : s.iterator;
            // Count the response whether or not it succeeded
            s.iterator++;
            // Leave the entry's text empty when the page could not be fetched
            if ( error || !html ) {
                console.error( 'Could not fetch text for issue ' + slot + ': ' + ( error ? error.message : 'empty response' ) );
                return;
            }
            // Init cheerio
            var $ = cheerio.load( html );
            // Callers that bypass scrapePage may target an entry that does not exist yet
            if ( !s.json[slot] ) {
                s.json[slot] = { 'issue': '', 'text': '' };
            }
            // Set the text
            s.json[slot]['text'] = $( '.article-text' ).text();
        },

        /**
         * Writes the collected issues to output.json as indented JSON and
         * reports whether the write succeeded.
         */
        outputJSON: function() {
            fs.writeFile( 'output.json', JSON.stringify( s.json, null, 4 ), function( error ) {
                if ( error ) {
                    console.error( 'Could not write output.json: ' + error.message );
                    return;
                }
                console.log( 'File successfully written' );
            } );
        }
    };
// Kick off the scrape as soon as the script is loaded
(function main() {
    Scrape.init();
}());
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment