Created
June 16, 2011 06:38
-
-
Save mrsinguyen/1028786 to your computer and use it in GitHub Desktop.
Github search scraping
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/*
Usage: node.io github <query> [language]
    node.io github django
    node.io github coffeescript
To limit search results to a certain language:
    node.io github django python
To see debug info:
    node.io --debug github django
*/
var nodeio = require('node.io'), search_url, added_additional = false; | |
exports.job = new nodeio.Job({max: 50, retries: 3, auto_retry: true}, { | |
init: function () { | |
var query = '', language = ''; | |
//Parse command line args | |
switch (this.options.args.length) { | |
case 0: | |
console.log('node.io github <query> [language]'); | |
process.exit(); | |
case 2: language = this.options.args[1]; | |
case 1: query = this.options.args[0]; | |
} | |
//Build the base search URL | |
search_url = 'https://github.com/search?type=Repositories&language=' | |
+ language + '&q=' + query | |
+ '&repo=&langOverride=&x=0&y=0&start_value='; | |
//The initial input is page 1 of search results | |
this.input = [search_url + 1]; | |
}, | |
run: function (search_page) { | |
this.getHtml(search_page, function(err, $) { | |
//Add additional pages of search results to the input queue (only once) | |
if (!added_additional) { | |
var page, total_pages = $('.pager_link').last().text; | |
for (page = 2; page < total_pages; page++) { | |
this.add(search_url + page); | |
} | |
added_additional = true; | |
} | |
//Scrape projects on the page and emit | |
var projects = []; | |
$('.result').each(function (listing) { | |
var project = {}, title, language; | |
title = $('h2 a', listing).fulltext; | |
language = $('.language', listing).fulltext; | |
project.author = title.substring(0, title.indexOf(" / ")); | |
project.title = title.substring(title.indexOf(" / ") + 3); | |
project.link = "https://github.com" + $('h2 a', listing).attribs.href; | |
project.language = language.substring(1, language.length - 1); | |
project.description = $('.description', listing).fulltext; | |
projects.push(project); | |
}); | |
this.emit(projects); | |
}); | |
} | |
}); |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment