Created
June 27, 2012 21:28
-
-
Save marcuswestin/3006984 to your computer and use it in GitHub Desktop.
Stay up to date on funding rounds
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// sudo npm install request && sudo npm install jsdom
//
// Scrape Crunchbase funding-round listing pages and print one line per round.
// Usage: node <script> [page ...]   (defaults to page 0 when no page is given)
// Scraped rounds are written to stdout; progress and error messages to stderr.
var request = require('request') // required by the original script but not referenced below
var jsdom = require('jsdom')

var pages = process.argv.slice(2)
if (!pages.length) { pages = [0] }
console.error('Scrape Crunchbase funding rounds. Pages:', pages)

// One entry (an array of round objects) per finished page. All pages are
// requested concurrently, so entries are stored in completion order, not in
// the order the pages were given on the command line.
var pagesData = []
// Number of pages whose DOM (or injected jQuery) could not be obtained.
var failedPages = 0

// Record the rounds collected for one page; once every requested page has
// reported back (successfully or not), terminate the process.
function pageFinished(data) {
  pagesData.push(data)
  if (pagesData.length === pages.length) {
    console.error("Done!")
    // Exit explicitly, as the original script did, rather than waiting for
    // the event loop to drain. Non-zero status signals that a page failed.
    process.exit(failedPages ? 1 : 0)
  }
}

// Load one listing page through jsdom, inject jQuery, and log every row of
// the funding table found under #col2_internal.
//   page - page number/string appended to the listing URL's `page` parameter
function scrapePage(page) {
  console.error("Scrape page:", page)
  jsdom.env({
    // The page value comes straight from the command line, so escape it
    // before placing it in the query string.
    html: 'http://www.crunchbase.com/funding-rounds?page=' + encodeURIComponent(page) + '&q=all',
    scripts: 'http://code.jquery.com/jquery-1.7.2.min.js',
    done: function(errors, win) {
      if (errors) { console.error("Errors while loading page", page + ':', errors) }
      // Without a window and its injected jQuery there is nothing to scrape.
      // Count the page as finished so the run still terminates, instead of
      // throwing a TypeError on `win.$`.
      if (!win || !win.$) {
        failedPages += 1
        console.error("Could not load page:", page)
        return pageFinished([])
      }

      var $ = win.$
      // Trimmed text content of an element (or jQuery selection).
      var text = function(el) { return $.trim($(el).text()) }

      console.log("\n\nPage", page + ':')
      var data = []
      $('#col2_internal tr').each(function(rowIndex) {
        if (rowIndex === 0) { return } // header
        var td = $(this).find('td')
        // Cells 1..4 are read as company link, round, amount and investor links.
        var round = {
          company: text($(td[1]).find('a')),
          round: text(td[2]),
          amount: text(td[3]),
          investors: $.map($(td[4]).find('a'), function(a) { return text(a) })
        }
        data.push(round)
        console.log(round.company, round.amount, round.round, '(' + round.investors.join(', ') + ')')
      })
      pageFinished(data)
    }
  })
}

for (var i = 0; i < pages.length; i++) {
  scrapePage(pages[i])
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment