Skip to content

Instantly share code, notes, and snippets.

@rafinskipg
Created December 5, 2014 13:02
Show Gist options
  • Save rafinskipg/250c6cfca996fed660ac to your computer and use it in GitHub Desktop.
Save rafinskipg/250c6cfca996fed660ac to your computer and use it in GitHub Desktop.
Betabeers CV scraper
var request = require('request');
var _ = require('lodash');
var fs = require('graceful-fs');
var cheerio = require('cheerio');
/**
 * Fetches the CV upload directory listing and passes the returned body to
 * parseResponse, with 'files' as the folder argument and the listing URL as
 * the base URL for the individual downloads.
 *
 * Failures (network error or a non-200 status) are reported on stderr
 * instead of being silently ignored.
 */
function getCvs(){
// Single source of truth: the listing and the downloads use the same
// (https) location, instead of fetching one over https and the other over http.
var listingUrl = 'https://betabeers.com/static/uploads/cv/';
request(listingUrl, function (error, response, body) {
if (error) {
console.error('Could not fetch CV listing: ' + error.message);
return;
}
if (response.statusCode !== 200) {
console.error('Unexpected status ' + response.statusCode + ' while fetching CV listing');
return;
}
parseResponse(body, 'files', listingUrl);
});
}
/**
 * Fetches the invoice upload directory listing and passes the returned body
 * to parseResponse, with 'invoices' as the folder argument and the listing
 * URL as the base URL for the individual downloads.
 *
 * Failures (network error or a non-200 status) are reported on stderr
 * instead of being silently ignored.
 */
function getInvoices(){
// Single source of truth: the listing and the downloads use the same
// (https) location, instead of fetching one over https and the other over http.
var listingUrl = 'https://betabeers.com/static/uploads/invoices/';
request(listingUrl, function (error, response, body) {
if (error) {
console.error('Could not fetch invoice listing: ' + error.message);
return;
}
if (response.statusCode !== 200) {
console.error('Unexpected status ' + response.statusCode + ' while fetching invoice listing');
return;
}
parseResponse(body, 'invoices', listingUrl);
});
}
/**
 * Extracts the href of every <a> element in an HTML document and hands the
 * resulting list to requestFiles.
 *
 * @param {string} body    HTML to parse.
 * @param {string} folder  Forwarded unchanged to requestFiles.
 * @param {string} baseUrl Forwarded unchanged to requestFiles.
 */
function parseResponse(body, folder, baseUrl){
var $ = cheerio.load(body);
console.log('Getting elements');
// Iterate only the matched DOM nodes. Walking the wrapper's own keys would
// also visit its internal bookkeeping properties and would throw as soon as
// one of them is null or undefined.
var anchors = $('a').toArray();
var routes = _.compact(anchors.map(function(anchor){
return anchor.attribs ? anchor.attribs.href : undefined;
}));
requestFiles(routes, folder, baseUrl);
}
/**
 * Downloads every entry of `routes` whose name contains '.pdf' from
 * `baseUrl + name` into `folder + '/' + name`, running at most ten
 * downloads at the same time.
 *
 * Entries that are not PDFs, or whose name contains a path separator, are
 * skipped. A failed download is logged and does not stop the remaining ones.
 * 'End' is logged once, after the last in-flight download has settled.
 *
 * @param {string[]} routes  Candidate file names (hrefs); not modified.
 * @param {string}   folder  Destination directory; created if missing.
 * @param {string}   baseUrl URL prefix the file names are appended to.
 */
function requestFiles(routes, folder, baseUrl){
var maxRequests = 10;
// Work on a copy so the caller's array is not emptied as a side effect.
var pending = routes.slice();
var activeWorkers = maxRequests;

// The write streams below fail with ENOENT if the folder does not exist.
try {
fs.mkdirSync(folder);
} catch (err) {
if (err.code !== 'EEXIST') {
throw err;
}
}

// Returns the next downloadable name, or null when the queue is drained.
// Looping here (rather than returning on the first non-PDF) keeps a worker
// alive when it meets entries such as '../' or '?C=N;O=D'.
function nextPdf(){
while (pending.length > 0) {
var candidate = pending.pop();
if (typeof candidate !== 'string' || candidate.indexOf('.pdf') === -1) {
continue;
}
// Names come from a remote page: refuse anything that could escape `folder`.
if (/[\/\\]/.test(candidate)) {
console.log('Skipping unsafe file name ' + candidate);
continue;
}
return candidate;
}
return null;
}

// Called once by each worker when it finds the queue empty.
function workerDone(){
activeWorkers--;
if (activeWorkers === 0) {
console.log('End');
}
}

function processFile(){
var file = nextPdf();
if (file === null) {
workerDone();
return;
}
var settled = false;
// Moves this worker on to the next file exactly once per download, no
// matter how many of 'finish' / 'error' fire for it.
function advance(err){
if (settled) {
return;
}
settled = true;
if (err) {
console.error('File failed ' + file + ': ' + err.message);
} else {
console.log('File procesed ' + file);
}
processFile();
}
var download = request(baseUrl + file);
var output = fs.createWriteStream(folder + '/' + file);
download.on('error', function(err){
advance(err);
// pipe() does not end the destination when the source errors.
output.end();
});
output.on('error', advance);
output.on('finish', function(){
advance();
});
download.pipe(output);
}

for (var i = 0; i < maxRequests; i++) {
processFile();
}
}
// Entry point: start both scrapes, CVs first and then invoices.
[getCvs, getInvoices].forEach(function(startScrape){
startScrape();
});
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment