Skip to content

Instantly share code, notes, and snippets.

@gabrielstuff
Created September 12, 2012 17:08
Show Gist options
  • Select an option

  • Save gabrielstuff/3708207 to your computer and use it in GitHub Desktop.

Select an option

Save gabrielstuff/3708207 to your computer and use it in GitHub Desktop.
Scraper for the website societe.com
var nodeio = require('node.io');
// Root of the scraped site; the job prefixes it to the search path and to result links.
// `options` is handed as-is to nodeio.Job.
// NOTE(review): per the node.io docs, `max` caps concurrent requests, `timeout` is in
// seconds and `jsdom` selects the jsdom/jQuery parser — confirm against the installed version.
var main_url = "http://www.societe.com",
    options = {
        jsdom: true,
        timeout: 20,
        max: 20
    };
exports.job = new nodeio.Job(options, {
input: [{
type: 'initial',
url: main_url + '/cgi-bin/mainsrch?champ=' + encodeURIComponent(process.argv[3])
}],
run: function(input) {
var self = this;
var output = [];
if(input.type === 'initial') {
this.getHtml(input.url, function(err, $) {
//Handle any request / parsing errors
if(err) this.exit(err);
try {
$('table.GlobalTable a.Blue12').each(function(index, el) {
var url_ = main_url + el.href;
self.add({
type: 'subrequest',
url: main_url + el.href,
name: $(el).text()
});
self.skip();
});
} catch(err) {
console.log(err);
}
});
} else {
//console.log(self.assigned_input.name);
this.getHtml(input.url, function(err, $) {
if(err) this.exit(err);
try {
var siret = "",
ca_soc = "",
siege_soc = "",
matriculation = "",
categorie = "",
juridique = "",
ca_ = "",
ceo_ = [];
var foundin = $('td:contains("Forme juridique"):last');
foundin.parent().parent().find('td').each(function(index, el) {
if($(el).text().indexOf("social") != -1) {
$(el).parent("tr").children().each(function(index, el) {
if(index == 1) siege_soc = $(el).text().trim();
});
}
if($(el).text().indexOf("juridique") != -1) {
$(el).parent("tr").children().each(function(index, el) {
if(index == 1) juridique = $(el).text().trim();
});
}
if($(el).text().indexOf("gorie") != -1) { //for categorie
$(el).parent("tr").children().each(function(index, el) {
if(index == 1) categorie = $(el).text().trim();
});
}
});
var foundin = $('td:contains("Immatriculation"):last');
foundin.parent().parent().find('td').each(function(index, el) {
if($(el).text().indexOf("SIRET") != -1) {
$(el).parent("tr").children().each(function(index, el) {
if(index == 1) {
siret = $(el).text().trim();
}
});
}
if($(el).text().indexOf("Capital social") != -1) {
$(el).parent("tr").children().each(function(index, el) {
ca_soc = $(el).text().trim();
});
}
if($(el).text().indexOf("matriculation") != -1) { //for categorie
$(el).parent("tr").children().each(function(index, el) {
if(index == 1) matriculation = $(el).text().trim();
});
}
});
var foundin = $('td > div > b:contains("Dirigeants"):last');
foundin.closest("td").children('table:first').find('td').each(function(index, el) {
if(($(el).text().indexOf("ant") != -1) || //Gérant
($(el).text().indexOf("ateur") != -1) || //Administrateur
($(el).text().indexOf("DG") != -1) || //DG
($(el).text().indexOf("dent") != -1)) { // Président
$(el).parent("tr").children().each(function(index, el) {
if(index == 1) {
ceo_.push($(el).text().trim());
}
});
}
});
output = "[" + $("h1").text() + "] " + this.last.url + " | " + siret + " | " + ca_soc + " | " + categorie + " | " + ceo_.join(" ") + " | " + siege_soc + " | " + juridique;
//this.emit(output);
console.log(output)
} catch(err) {
console.log(err);
}
});
}
}
});
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment