Skip to content

Instantly share code, notes, and snippets.

@sang4lv
Last active December 20, 2015 13:29
Show Gist options
  • Save sang4lv/6138700 to your computer and use it in GitHub Desktop.
Save sang4lv/6138700 to your computer and use it in GitHub Desktop.
Node JS Webpage Scraper
var url = require("url");
var http = require("http");
var jsdom = require("jsdom");
var request = require("request");
/**
 * Serialises `payload` as JSON and finishes the response with `status_code`.
 *
 * The Content-type is kept as "text/plain" because that is what the success
 * path of this server has always sent.
 *
 * @param {http.ServerResponse} top_response - Response to write to.
 * @param {number} status_code - HTTP status code to send.
 * @param {Object} payload - Value passed to JSON.stringify for the body.
 */
function send_json( top_response, status_code, payload ) {
    top_response.writeHead( status_code, { "Content-type": "text/plain" } );
    top_response.end( JSON.stringify(payload) );
}

/**
 * HTTP server exposing `GET /author/<name>`.
 *
 * For a matching path it fetches http://profiles.wordpress.org/<name>, loads
 * the returned HTML into jsdom with jQuery injected, and replies with a JSON
 * object `{ plugins: [...], themes: [...] }` holding the `href` of every link
 * found under `#main-column .main-plugins li a` and
 * `#main-column .main-themes li a`.
 *
 * Every request receives a response:
 *   - 404 when the path is not `/author/<name>`,
 *   - 502 when the upstream request fails or does not return status 200,
 *   - 502 when jsdom cannot build the page or jQuery is not available,
 *   - 200 with the scraped links otherwise.
 */
var server = http.createServer( function( top_request, top_response ) {
    // Drop the query string so it is not forwarded upstream as part of the
    // profile name.
    var pathname = top_request.url.split("?")[0].split("/");

    // Guard clause: without this, request() would be called with an empty
    // URI, which it rejects, and the client would never get a response.
    if( "author" !== pathname[1] || !pathname[2] ) {
        send_json( top_response, 404, { error: "Not Found" } );
        return;
    }

    var uri_path = "http://profiles.wordpress.org/" + pathname[2];

    request( {
        uri: uri_path
    }, function( error, response, body ) {
        // `response` may be undefined when `error` is set, so `error` must be
        // tested first. Stop here instead of parsing a missing body.
        if( error || response.statusCode !== 200 ) {
            console.log( "Request Error" );
            send_json( top_response, 502, { error: "Request Error" } );
            return;
        }

        jsdom.env( {
            html: body,
            scripts: ['http://code.jquery.com/jquery-1.10.2.min.js'],
            done: function( error, window ) {
                // The injected jQuery script is fetched over the network, so
                // the window can exist without `window.jQuery`.
                if( error || !window || !window.jQuery ) {
                    console.log( "Parse Error" );
                    if( window ) {
                        window.close();
                    }
                    send_json( top_response, 502, { error: "Parse Error" } );
                    return;
                }

                var $ = window.jQuery;
                var output = {
                    plugins: [],
                    themes: []
                };

                $("#main-column .main-plugins li a").each( function( index, element ) {
                    output.plugins.push( $(element).prop("href") );
                } );
                $("#main-column .main-themes li a").each( function( index, element ) {
                    output.themes.push( $(element).prop("href") );
                } );

                // Release the jsdom window; `output` only holds plain strings
                // at this point, so it is safe to close before responding.
                window.close();

                send_json( top_response, 200, output );
            }
        } );
    } );
} );
server.listen(8000);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment