Skip to content

Instantly share code, notes, and snippets.

@sfnprg
Created May 10, 2020 12:50
Show Gist options
  • Save sfnprg/7d8ed5a705e661a961f74677627dac92 to your computer and use it in GitHub Desktop.
Save sfnprg/7d8ed5a705e661a961f74677627dac92 to your computer and use it in GitHub Desktop.
artoo.js - scraping code
/**
 * artoo.js scrape definition for a pagesjaunes.fr result listing.
 *
 * `iterator` selects one element per listing; every entry of `data`
 * describes one output field through a selector (`sel`) and, optionally, a
 * `method` callback that post-processes the selection. Callbacks receive
 * jQuery as `$` and wrap `this` with it.
 *
 * NOTE(review): artoo presumably binds `this` to the elements matched by
 * `sel` and lets `method` take precedence over `attr`/`content` -- confirm
 * against the artoo.scrape documentation. Those keys are kept untouched.
 *
 * The object is built inside an IIFE so the shared helpers below stay
 * private and `scraper` remains the only name this block defines.
 */
var scraper = (function () {
  var SITE_ROOT = 'https://www.pagesjaunes.fr';

  // Text of the selection, trimmed, with parentheses dropped and each
  // newline turned into a space (shared by `address` and `description`).
  function flattenedText($) {
    return $(this).text().trim().replace(/[\(\)]/g, '').replace(/\n/g, ' ');
  }

  // One entry per selected element: its text with all whitespace removed
  // (shared by `phone` and `website`).
  function compactTexts($) {
    var values = [];
    $.each($(this), function () {
      values.push($(this).text().trim().replace(/\s/g, ''));
    });
    return values;
  }

  // Absolute profile URL built from the link's href, or '' when there is
  // nothing usable.
  function profileUrl($) {
    var href = $(this).attr('href');
    // A listing without the link yields no href at all; return '' instead
    // of throwing so that one incomplete listing cannot abort the scrape.
    if (!href) {
      return '';
    }
    // Only the first '#' is removed, as a string pattern replaces once.
    var path = href.replace('#', '');
    return path ? SITE_ROOT + path : '';
  }

  // Trimmed text of the selection, unchanged otherwise.
  function trimmedText($) {
    return $(this).text().trim();
  }

  // Selection text with newlines removed, split on commas, each part trimmed.
  function tagList($) {
    var parts = $(this).text().trim().replace(/\n/g, '').split(',');
    return $.map(parts, $.trim);
  }

  return {
    iterator: 'article.bi-bloc.blocs.bi-pro',
    data: {
      name: {sel: 'a.denomination-links', attr: 'title'},
      profile_url: {sel: 'a.denomination-links', attr: 'href', method: profileUrl},
      address: {sel: 'div.adresse-container', content: 'text', method: flattenedText},
      rating_avg: {sel: '.avis span.bi-note strong'},
      rating_count: {sel: '.avis .contribution a span.nb_avis', method: trimmedText},
      phone: {sel: 'li.bi-contact-numbers strong.num', attr: 'title', method: compactTexts},
      website: {sel: 'li.bi-site-internet a.pj-link', attr: 'href', method: compactTexts},
      description: {sel: '.description p.cviv.cris', content: 'text', method: flattenedText},
      tags: {sel: '.activites-mentions a', content: 'text', method: tagList},
      image: {sel: 'a.visuel.photo img', attr: 'src'}
    }
  };
})();
/**
 * Reads the address of the following result page from a page wrapper.
 *
 * @param {Object} $page - jQuery-like wrapper exposing `find`.
 * @returns {*} The `href` attribute of the element matching
 *     `a.link_pagination.next`, exactly as `attr` reports it.
 */
function nextUrl($page) {
  var $nextLink = $page.find('a.link_pagination.next');
  return $nextLink.attr('href');
}
// Driver: scrape the page currently loaded, then let artoo's ajax spider
// follow the pagination links and save everything as one JSON file.
artoo.log.debug('Starting the scraper...');

// Rows of the current document, collected before any further page is fetched.
var frontpage = artoo.scrape(scraper);

artoo.ajaxSpider(
  // Supplies the next address to fetch: taken from the live document while
  // the index is still falsy, from the previously fetched data afterwards.
  function pickNextUrl(i, $data) {
    return nextUrl(i ? $data : artoo.$(document));
  },
  {
    // NOTE(review): presumably caps the number of fetched pages -- confirm
    // against the artoo.ajaxSpider documentation.
    limit: 2,
    scrape: scraper,
    // NOTE(review): presumably merges per-page results into one flat array
    // -- confirm.
    concat: true,
    done: function saveEverything(data) {
      artoo.log.debug('Finished retrieving data. Downloading...');
      // Front-page rows come first, followed by what the spider returned.
      var allRows = frontpage.concat(data);
      artoo.savePrettyJson(allRows, {filename: 'pagesjaunes.json'});
    }
  }
);
// Standalone variant: scrape every listing of the current page and hand the
// model's result to artoo.savePrettyJson (passed as the last argument).
// Each field names a selector (`sel`) and, optionally, a `method` callback
// that receives jQuery as `$` and wraps `this` with it.
// NOTE(review): artoo presumably binds `this` to the elements matched by
// `sel` and prefers `method` over `attr`/`content` -- confirm against the
// artoo.scrape documentation.
artoo.scrape('article.bi-bloc.blocs.bi-pro', {
  name: {sel: 'a.denomination-links', attr: 'title'},
  profile_url: {sel: 'a.denomination-links', attr: 'href', method: function($) {
    var href = $(this).attr('href');
    // A listing without the link yields no href at all; return '' instead
    // of throwing so that one incomplete listing cannot abort the scrape.
    if (!href) {
      return '';
    }
    // Only the first '#' is removed, as a string pattern replaces once.
    var url = href.replace("#", "");
    if (url) {
      url = "https://www.pagesjaunes.fr" + url;
    }
    return url;
  }},
  // Trimmed text without parentheses; each newline becomes a space.
  address: {sel: 'div.adresse-container', content: 'text', method: function($) {
    return $(this).text().trim().replace(/[\(\)]/g, '').replace(/\n/g, " ");
  }},
  rating_avg: {sel: '.avis span.bi-note strong'},
  rating_count: {sel: '.avis .contribution a span.nb_avis', method: function($) {
    return $(this).text().trim();
  }},
  // One entry per selected element: its text with all whitespace removed.
  phone: {sel: 'li.bi-contact-numbers strong.num', attr: 'title', method: function($) {
    var arr = [];
    $.each($(this), function() {
      var data = $(this).text().trim().replace(/\s/g, "");
      arr.push(data);
    });
    return arr;
  }},
  // Same treatment as `phone`: the callback collects element text.
  website: {sel: 'li.bi-site-internet a.pj-link', attr: 'href', method: function($) {
    var arr = [];
    $.each($(this), function() {
      var data = $(this).text().trim().replace(/\s/g, "");
      arr.push(data);
    });
    return arr;
  }},
  // Same cleanup as `address`.
  description: {sel: '.description p.cviv.cris', content: 'text', method: function($) {
    return $(this).text().trim().replace(/[\(\)]/g, '').replace(/\n/g, " ");
  }},
  // Selection text with newlines removed, split on commas, each part trimmed.
  tags: {sel: '.activites-mentions a', content: 'text', method: function($) {
    var tags = $(this).text().trim().replace(/\n/g, "").split(",");
    tags = $.map(tags, $.trim);
    return tags;
  }},
  image: {sel: 'a.visuel.photo img', attr: 'src'}
}, artoo.savePrettyJson);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment