Skip to content

Instantly share code, notes, and snippets.

@yrezgui
Last active August 29, 2015 14:02
Show Gist options
  • Select an option

  • Save yrezgui/224154e642951d96e834 to your computer and use it in GitHub Desktop.

Select an option

Save yrezgui/224154e642951d96e834 to your computer and use it in GitHub Desktop.
var mongodb = require('mongodb');
var client = mongodb.MongoClient;
var request = require('request');
var async = require('async');
// Jobs endpoint; the crawler requests it once without a page number and then
// once per page using the `page` query-string parameter.
var API_JOBS_URL = 'https://api.angel.co/1/jobs';
// Concurrency passed to async.queue: how many page tasks may run at once.
var MAX_CONCURRENT_WORKERS = 5;
/**
 * Opens a connection to the local `shigoto` MongoDB database.
 *
 * @param {function(?Error, ?Object)} callback - Node-style callback. Receives
 *   `(error, null)` when the connection fails, `(null, db)` otherwise.
 */
function databaseConnect(callback) {
  var uri = 'mongodb://127.0.0.1:27017/shigoto';

  client.connect(uri, function onConnect(error, db) {
    // Normalise both outcomes into one call: a falsy error becomes `null`,
    // and no database handle is exposed when the connection failed.
    callback(error || null, error ? null : db);
  });
}
/**
 * Crawls every page of the jobs API and stores the returned jobs in MongoDB.
 *
 * A first request to API_JOBS_URL reads `last_page`; one task per page is then
 * queued and processed by at most MAX_CONCURRENT_WORKERS workers at a time.
 * Each job's `id` is moved to `_id` before the page is inserted into the
 * `jobs` collection.
 *
 * @param {Object} db - Connected database handle exposing `collection(name)`.
 * @param {function(?Error, string=)} finalCallback - Node-style callback,
 *   invoked exactly once: with the first error encountered, or with `null`
 *   and a completion message once every page has been processed.
 */
function crawler(db, finalCallback) {
  var DONE_MESSAGE = 'all items have been processed';
  var finished = false;

  // Reports the outcome exactly once; later errors or a late drain are ignored.
  function finish(error, result) {
    if (finished) {
      return;
    }
    finished = true;
    finalCallback(error, result);
  }

  // Per-task completion handler: without it the queue drops task errors.
  function onTaskDone(error) {
    if (error) {
      finish(error);
    }
  }

  var worker = async.queue(function process(task, callback) {
    // A failure has already been reported: let the remaining tasks drain
    // without issuing further requests or inserts.
    if (finished) {
      return callback();
    }

    return request({
      url: task.url,
      qs: {page: task.page},
      method: 'GET',
      json: true
    }, function onPage(error, response) {
      if (error) {
        return callback(error);
      }

      var jobs = response && response.body && response.body.jobs;
      if (!Array.isArray(jobs)) {
        return callback(new Error('Page ' + task.page + ' did not contain a jobs list'));
      }

      // Nothing to store for an empty page; skip the insert of an empty batch.
      if (jobs.length === 0) {
        console.log('Page ' + task.page + ' processed');
        return callback();
      }

      // Use the API identifier as the MongoDB primary key.
      jobs.forEach(function(job) {
        job._id = job.id;
        delete job.id;
      });

      db.collection('jobs').insert(jobs, function onInsert(insertError) {
        if (insertError) {
          return callback(insertError);
        }
        console.log('Page ' + task.page + ' processed');
        callback();
      });
    });
  }, MAX_CONCURRENT_WORKERS);

  // Success is reported as the result (second argument), not as an error.
  worker.drain = function drain() {
    finish(null, DONE_MESSAGE);
  };

  // The first request tells us how many pages there are.
  request({
    url: API_JOBS_URL,
    method: 'GET',
    json: true
  }, function onFirstPage(error, response) {
    if (error) {
      // Throwing here would escape as an uncaught exception; report instead.
      return finish(error);
    }

    var lastPage = parseInt(response && response.body && response.body.last_page, 10);
    if (isNaN(lastPage)) {
      return finish(new Error('Jobs API response did not include last_page'));
    }
    if (lastPage < 1) {
      // No task would be queued, so drain would never fire: finish right away.
      return finish(null, DONE_MESSAGE);
    }

    for (var page = lastPage; page >= 1; page--) {
      worker.push({ url: API_JOBS_URL, page: page }, onTaskDone);
    }
  });
}
// Entry point: connect to the database, then run the crawler with the
// resulting handle, and log whatever the pipeline reports at the end.
async.waterfall(
  [databaseConnect, crawler],
  function onCrawlFinished(err, outcome) {
    console.log('enfin');
    console.log(err, outcome);
  }
);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment