Skip to content

Instantly share code, notes, and snippets.

@guillim
Last active February 3, 2023 16:16
Show Gist options
  • Save guillim/cf0de926708746f3d08177cdfeca4b0a to your computer and use it in GitHub Desktop.
Save guillim/cf0de926708746f3d08177cdfeca4b0a to your computer and use it in GitHub Desktop.
Boulanger #dgm #boulanger
//------------START------------------------------------------------------------------------------------------------
/**
 * Entry point: routes the current request to the handler for its label.
 *
 * `site` and `typeDeCrawl` are overridden by `context.customData.site` /
 * `context.customData.typeDeCrawl` when those are truthy. Labels other than
 * 'home', 'search' and 'product' are ignored.
 *
 * @param {Object} context - crawler context; `request.label` and `customData` are read here.
 * @param {Function} $ - DOM query function forwarded to the search/product handlers.
 * @param {Object} _ - utility library forwarded to the search/product handlers.
 * @param {*} site - default site value.
 * @param {*} typeDeCrawl - default crawl type.
 */
function start(context,$,_,site,typeDeCrawl){
  var custom = context.customData;
  if (custom && custom.site) {
    site = custom.site;
  }
  if (custom && custom.typeDeCrawl) {
    typeDeCrawl = custom.typeDeCrawl;
  }

  var label = context.request.label;
  if (label === 'home') {
    case_home(context, site, typeDeCrawl);
  } else if (label === 'search') {
    case_search(context, $, _, site, typeDeCrawl);
  } else if (label === 'product') {
    case_product(context, $, _, site, typeDeCrawl);
  }
}
//------------HOME------------------------------------------------------------------------------------------------
/**
 * Handles the 'home' request: seeds the crawl queue from a comma-separated list.
 *
 * The list is `context.customData.initialList` when present, otherwise
 * `context.customData` itself. For typeDeCrawl 'profond' or 'simple' every
 * entry is enqueued as a search keyword; for 'produit' every entry is enqueued
 * as a product URL. Blank entries (e.g. from a trailing comma) are skipped.
 *
 * Finishes with a `debugInfo` object instead of throwing when typeDeCrawl is
 * not recognised or when no string list is available.
 *
 * @param {Object} context - crawler context (`customData`, `skipOutput`, `finish` are used).
 * @param {*} site - unused here; kept for signature compatibility.
 * @param {string} typeDeCrawl - 'profond' | 'simple' | 'produit'.
 */
function case_home(context,site,typeDeCrawl){
  var customData = context.customData;
  var initialList = (customData && customData.initialList) ? customData.initialList : customData;
  var isSearchCrawl = (typeDeCrawl === 'profond' || typeDeCrawl === 'simple');
  var isProductCrawl = (typeDeCrawl === 'produit');

  if (!isSearchCrawl && !isProductCrawl) {
    context.finish({ debugInfo: 'case_home: bug in typeDeCrawl' });
    return;
  }
  // Guard: calling .split on undefined or on a plain object would throw.
  if (typeof initialList !== 'string') {
    context.finish({ debugInfo: 'case_home: initialList is missing or is not a comma-separated string' });
    return;
  }

  context.skipOutput();
  initialList.split(',').forEach(function (entry) {
    if (entry.trim() === '') {
      return; // nothing to crawl for an empty entry
    }
    if (isSearchCrawl) {
      enqueueSearch(context, entry);
    } else {
      enqueueProductUrl(context, entry);
    }
  });
  context.finish();
}
//------------SEARCH----------------------------------------------------------------------------------------------
/**
 * Handles a search-results page.
 *
 * Polls the DOM (first after 500 ms, then every 2 s) until at least one
 * `div.product` element exists, then builds one record per product row from
 * a clone of the request's `interceptRequestData`.
 *   - typeDeCrawl === 'profond': each of the first 15 rows is enqueued as a
 *     'product' page carrying its record, and output for this page is skipped.
 *   - any other typeDeCrawl: the first 15 records are returned via
 *     `context.finish(result)`.
 * If nothing shows up within 10 s, finishes once with the request data plus a
 * `debugInfo` message and stops polling.
 *
 * @param {Object} context - crawler context (`request`, `finish`, `skipOutput`, `enqueuePage`).
 * @param {Function} $ - jQuery-style selector function for the page.
 * @param {Object} _ - utility library; only `_.clone` is used.
 * @param {*} site - unused here; kept for signature compatibility.
 * @param {string} typeDeCrawl - crawl mode, see above.
 */
function case_search(context,$,_,site,typeDeCrawl){
  var MAX_POSITION = 15;
  var TIMEOUT_MS = 10000;
  var POLL_INTERVAL_MS = 2000;
  var BASE_URL = 'https://www.boulanger.com';
  var startedAt = Date.now();
  // Declared locally: the original assigned an undeclared (implicit global) variable.
  var requestData = context.request.interceptRequestData || {};

  // "1.299,99€" style text -> "1299.99." style text that parseFloat can read.
  var toNumberText = function (priceText) {
    return priceText.replace(/\./g,"").replace(/,/g,".").replace(/€/g, ".");
  };
  var toPrice = function (priceText) {
    var numberText = toNumberText(priceText);
    return (numberText) ? parseFloat(numberText.replace(/[^0-9.]*/g,'')) : 'NA';
  };
  // Total result count shown on the page: undefined when the element has no
  // text, 'NA' when no number can be extracted from it.
  var readResultCount = function () {
    var raw = $(".infoListe span").text();
    if (!raw) {
      return undefined;
    }
    var found = raw.match(/[0-9, /\s]*article/g);
    var digits = (found) ? found[0].replace(/[^0-9]/g,'') : '';
    return (digits) ? parseInt(digits, 10) : 'NA';
  };

  var poll = function() {
    if( Date.now() - startedAt > TIMEOUT_MS ) {
      requestData.debugInfo = 'case_searchpagination: timeout after 10 seconds - is captcha true or false: ' + checkCaptcha($);
      context.finish(requestData);
      return; // stop here: do not finish a second time or re-arm the poll
    }

    var rows = $("div.product");
    if (!(rows.length > 0)) {
      setTimeout(poll, POLL_INTERVAL_MS);
      return;
    }

    var result = [];
    var position = 0;
    var resultCount = readResultCount();

    rows.each(function () {
      var row = $(this);
      var record = _.clone(requestData);
      record.c02_marketplaceName = 'boulanger.com';
      if (resultCount !== undefined) {
        record.c03_NumberofResults = resultCount;
      }
      record.c24_pageNumber = 1;

      var link = row.find('h2 a');
      var href = link.attr('href');
      // Only build a URL when the row has a link; string concatenation with
      // undefined would otherwise produce a truthy bogus URL.
      record.c06_itemURL = (href) ? BASE_URL + href : undefined;
      position++;
      record.c07_position = position;
      record.c04_asin = (href) ? href.replace(/[^0-9.]*/g,'') : 'NA';
      record.c05_itemTitle = removeSpecCharacterIfExist(link.text().trim());
      record.c09_seller = 'Boulanger';

      record.c14_priceRaw = removeSpecCharacterIfExist(row.find('.priceBarre:eq(0)').text());
      var lowText = removeSpecCharacterIfExist(row.find('.fix-price .exponent:eq(0)').text())
        + removeSpecCharacterIfExist(row.find('.fix-price sup:eq(0)').text());
      record.c13_priceLow = toPrice(lowText);
      record.c12_priceHigh = toPrice(record.c14_priceRaw);
      record.c15_currency = currencyFormater(record.c14_priceRaw.replace(/[0-9., /\s]*/g,''));

      var commentsText = row.find(".rating > span:contains('avis'):eq(0)").text().trim();
      record.c08_numberofcomments = (commentsText) ? commentsText.replace(",","").replace(/[^0-9]/g,'') : 0;

      // The rating is encoded in a "star_NN" class token; NN / 10 is the note.
      record.c10_star = row.find(".rating").attr('class');
      if (record.c10_star) {
        var starToken = record.c10_star.match(/star_[0-9]*/g);
        var starDigits = (starToken && starToken[0]) ? starToken[0].replace(/[^0-9]/g,'') : false;
        record.c10_star = (starDigits) ? parseInt(starDigits, 10) / 10 : 'NA';
      }

      var isWithinLimit = record.c06_itemURL && record.c07_position && record.c07_position <= MAX_POSITION;
      if (typeDeCrawl === 'profond') {
        // Deep crawl: follow each kept result to its product page.
        // NOTE(review): the log messages below still mention 30 although the limit is 15.
        if (isWithinLimit) {
          enqueueLabelUniqueKey(context,'product',record.c06_itemURL,record);
          context.skipOutput();
        } else {
          console.log('typeDeCrawl === profond -------- c06_itemURL est pas defini -------- ou c07_position est superieur a 30 voir pas defini');
        }
      } else {
        if (isWithinLimit) {
          result.push(_.clone(record));
        } else {
          console.log('typeDeCrawl !== profond-------- c06_itemURL est pas defini -------- ou c07_position est superieur a 30 voir pas defini');
        }
      }
    });
    context.finish(result);
  };
  setTimeout(poll, 500);
}
//------------PRODUCT-----------------------------------------------------------------------------------
/**
 * Handles a product page.
 *
 * Polls (immediately, then every 2 s) until an `h1` element exists, then fills
 * the request's `interceptRequestData` with the product details and finishes
 * with it. Fields already carrying a truthy value (for example set by the
 * search page) are kept; only missing ones are filled from the product page.
 * If no `h1` shows up within 10 s, finishes once with a `debugInfo` message
 * and stops polling.
 *
 * @param {Object} context - crawler context (`request`, `finish`).
 * @param {Function} $ - jQuery-style selector function for the page.
 * @param {Object} _ - unused here; kept for signature compatibility.
 * @param {*} site - unused here; kept for signature compatibility.
 * @param {*} typeDeCrawl - unused here; kept for signature compatibility.
 */
function case_product(context,$,_,site,typeDeCrawl){
  var TIMEOUT_MS = 10000;
  var POLL_INTERVAL_MS = 2000;
  var interceptRequestData = (context.request.interceptRequestData) ? context.request.interceptRequestData : {};
  var startedAt = Date.now();
  //context.willFinishLater();
  // console.log('we try first to click on Avis clients')
  // try{
  // $("#caracteristiques li a:contains('Avis')").click()
  // } catch(e) { console.error(e); }

  // Keeps a value that is already set, otherwise uses the one read from this page.
  var keepExisting = function (current, fallback) {
    return (current) ? current : fallback;
  };
  // "1.299,99€" style text -> "1299.99." style text that parseFloat can read.
  var toNumberText = function (priceText) {
    return priceText.replace(/\./g,"").replace(/,/g,".").replace(/€/g, ".");
  };
  var toPrice = function (numberText) {
    return (numberText) ? parseFloat(numberText.replace(/[^0-9.]*/g,'')) : 'NA';
  };
  // Reads the note from the "star_NN" class token of `.rating.orange`.
  // Returns 0 when there is no rating element, 'NA' when the class cannot be
  // parsed or reading it fails.
  var readStar = function () {
    try {
      var ratingClass = $(".rating.orange").attr('class');
      if (!ratingClass) {
        return 0;
      }
      var token = ratingClass.match(/star_[0-9]*/g);
      if (!token) {
        // The original threw here after overwriting the default, which leaked
        // the raw class string into c10_star.
        return 'NA';
      }
      var digits = token[0].replace('star_','');
      // NOTE(review): the 2-point offset is not explained in this file — confirm.
      var shifted = (digits) ? parseFloat(digits) - 2 : false;
      return (shifted && shifted > 0) ? shifted / 10 : 0;
    } catch(e) {
      console.error(e);
      return 'NA';
    }
  };

  var g = function() {
    if( Date.now() - startedAt > TIMEOUT_MS ) {
      interceptRequestData.debugInfo = 'case_product: timeout after 10 seconds - check imageCount ? is captcha true or false:' + checkCaptcha($);
      context.finish(interceptRequestData);
      return; // stop here: do not finish again or keep polling
    }
    if (!$("h1").length) {
      setTimeout(g, POLL_INTERVAL_MS);
      return;
    }

    var imageCountClassic = $('div[type=image]').length;
    var imageCountAlternatif = $("#imageBlockThumbs img").length;
    interceptRequestData.c18_nombreDImages = imageCountClassic || imageCountAlternatif || 0;
    interceptRequestData.c27_ImageURL = $("#pp_picture img:eq(0)").attr('src');
    interceptRequestData.c16_descriptionProduit1 = removeSpecCharacterIfExist(trimIfExist($("h2:eq(0)").text()));

    var asin = $("span[itemprop=productID]").text() || 'NA';
    interceptRequestData.c04_asin = keepExisting(interceptRequestData.c04_asin, removeSpecCharacterIfExist(trimIfExist(asin)));
    interceptRequestData.c05_itemTitle = keepExisting(interceptRequestData.c05_itemTitle, removeSpecCharacterIfExist(trimIfExist($("h1").text())));

    var commentsText = $('.ratingandRef span.link').text();
    var numberofcomments = (commentsText) ? commentsText.replace(/[^0-9]/g,'') : '';
    interceptRequestData.c08_numberofcomments = keepExisting(interceptRequestData.c08_numberofcomments, removeSpecCharacterIfExist(trimIfExist(numberofcomments)));

    var bestPoints = removeSpecCharacterIfExist(trimIfExist($(".best-points").text()));
    // Cap the description at 1900 characters.
    interceptRequestData.c17_descriptionProduit2 = (typeof(bestPoints) === 'string') ? bestPoints.slice(0,1900) : bestPoints;

    interceptRequestData.c09_seller = keepExisting(interceptRequestData.c09_seller, 'Boulanger');
    interceptRequestData.c10_star = keepExisting(interceptRequestData.c10_star, readStar());

    var priceHighText = removeSpecCharacterIfExist($('.priceBarre:eq(0)').text());
    var priceLowText = removeSpecCharacterIfExist($('.fix-price .exponent:eq(0)').text()) + removeSpecCharacterIfExist($('.fix-price sup:eq(0)').text());
    var c14_priceRaw = removeSpecCharacterIfExist(priceLowText);
    var c13_priceLow = toPrice(toNumberText(priceLowText));
    var c12_priceHigh = toPrice(toNumberText(priceHighText));
    var c15_currency = c14_priceRaw.replace(/[0-9., /\s]*/g,'');

    // NOTE(review): the currency read from this page replaces any value already
    // present on the request data (same as before) — confirm this is intended.
    interceptRequestData.c15_currency = keepExisting(currencyFormater(c15_currency), c15_currency);
    interceptRequestData.c12_priceHigh = keepExisting(interceptRequestData.c12_priceHigh, c12_priceHigh);
    interceptRequestData.c13_priceLow = keepExisting(interceptRequestData.c13_priceLow, c13_priceLow);
    interceptRequestData.c14_priceRaw = keepExisting(interceptRequestData.c14_priceRaw, c14_priceRaw);

    // var repartition = {} ;
    // repartition.star5 = ($(".5star:contains('%')").text() ) ? $(".5star:contains('%')").text().match(/[0-9]*%/g)[0] : null;
    // repartition.star4 = ($(".4star:contains('%')").text() ) ? $(".4star:contains('%')").text().match(/[0-9]*%/g)[0] : null;
    // repartition.star3 = ($(".3star:contains('%')").text() ) ? $(".3star:contains('%')").text().match(/[0-9]*%/g)[0] : null;
    // repartition.star2 = ($(".2star:contains('%')").text() ) ? $(".2star:contains('%')").text().match(/[0-9]*%/g)[0] : null;
    // repartition.star1 = ($(".1star:contains('%')").text() ) ? $(".1star:contains('%')").text().match(/[0-9]*%/g)[0] : null;
    // interceptRequestData.c11_repartition = nullIfnothingOrNARepartitionStar(repartition,_);
    // interceptRequestData.c22_reviews = [];
    // $(".BVRRContentReview").map(function() {
    // console.log('---> here in $(".BVRRContentReview") ')
    // var review = {};
    // review.c06_title = $(this).find('span[itemprop=name]').text();
    // review.c03_note = $(this).find('span[itemprop=ratingValue]').text().trim();
    // review.c05_texte = $(this).find('span[itemprop=description]').text().trim();
    // // review.c04_type = removeSpecCharacterIfExist(trimIfExist($(this).find(".badges-genome-widget").text()));
    // review.c02_helpfulReview = removeSpecCharacterIfExist(trimIfExist($(this).find(".BVDI_FVPositive .BVDINumber").text()));
    // review.c02_helpfulReview = (review.c02_helpfulReview && review.c02_helpfulReview.match(/.*cela utile/g) ) ? review.c02_helpfulReview.match(/.*cela utile/g)[0] : 'NA' ;
    // review.c07_verified = ($(this).find("a:contains('Achat vérifié')").length > 0) ? true : false;
    // interceptRequestData.c22_reviews.push( review );
    // });
    context.finish(interceptRequestData);
  };
  g();
}
//------------------------------------------------------------------------------------------------
//-----------------------------------function-----------------------------------------------------
//------------------------------------------------------------------------------------------------
/**
 * Enqueues a 'search' page for one keyword.
 *
 * The keyword is trimmed, split on whitespace and each term is
 * percent-encoded on its own before the terms are joined with '+'. Encoding
 * per term (rather than with encodeURI on the whole string) keeps reserved
 * characters such as '&', '#', '=' or '+' inside the `tr` parameter instead
 * of letting them alter the query string.
 *
 * @param {Object} context - crawler context providing `enqueuePage`.
 * @param {string} keyword - raw keyword; stored unchanged as `c01_keyword`.
 */
function enqueueSearch(context,keyword) {
  var searchUrl = 'https://www.boulanger.com/resultats?tr=';
  var terms = keyword.trim().split(/\s+/);
  var encodedKeyword = terms.map(function (term) {
    return encodeURIComponent(term);
  }).join('+');
  context.enqueuePage({
    label: 'search',
    url: searchUrl + encodedKeyword,
    interceptRequestData: { c01_keyword: keyword }
  });
}
/**
 * Enqueues a 'product' page for the given URL.
 *
 * The request carries the URL as `c06_itemURL` and the marketplace name as
 * `c02_marketplaceName`.
 *
 * @param {Object} context - crawler context providing `enqueuePage`.
 * @param {string} url - product page URL.
 */
function enqueueProductUrl(context,url) {
  var seedData = {};
  seedData.c06_itemURL = url;
  seedData.c02_marketplaceName = 'boulanger.com';

  var page = {};
  page.label = 'product';
  page.url = url;
  page.interceptRequestData = seedData;

  context.enqueuePage(page);
}
/**
 * Enqueues a page with an arbitrary label and request payload.
 *
 * @param {Object} context - crawler context providing `enqueuePage`.
 * @param {string} label - label the enqueued request will carry.
 * @param {string} url - URL to enqueue.
 * @param {Object} interceptRequestData - data attached to the request as-is.
 */
function enqueueLabel(context,label,url,interceptRequestData) {
  var page = {};
  page.label = label;
  page.url = url;
  page.interceptRequestData = interceptRequestData;
  context.enqueuePage(page);
}
/**
 * Enqueues a page under a randomised unique key, so the same URL can be
 * enqueued more than once.
 *
 * The key is built from the URL, the item URL carried by the request data
 * (`c06_itemURL`, falling back to a legacy `itemURL` field, then to an empty
 * string) and a random integer.
 *
 * @param {Object} context - crawler context providing `enqueuePage`.
 * @param {string} label - label the enqueued request will carry.
 * @param {string} url - URL to enqueue.
 * @param {Object} [interceptRequestData] - data attached to the request as-is.
 */
function enqueueLabelUniqueKey(context,label,url,interceptRequestData) {
  var itemUrl = '';
  if (interceptRequestData) {
    // The records built in this file store the item URL as c06_itemURL; the
    // previously read `itemURL` field is never set and produced "undefined".
    itemUrl = interceptRequestData.c06_itemURL || interceptRequestData.itemURL || '';
  }
  context.enqueuePage({
    label: label,
    url: url,
    uniqueKey: url + itemUrl + Math.floor(Math.random() * 1000000000000000),
    interceptRequestData: interceptRequestData
    // ,queuePosition: "LAST"
  });
}
/**
 * Maps the empty string to `false`; every other value is returned unchanged.
 *
 * The previous condition (`text && text === ''`) could never be true because
 * the empty string is falsy, so the function never produced `false`.
 *
 * @param {*} text - value to check.
 * @returns {*} `false` when `text` is '', otherwise `text`.
 */
function replaceByfalseifneeded(text) {
  return (text === '') ? false : text;
}
/**
 * Trims a truthy value and passes the result through
 * `replaceByfalseifneeded`; falsy values are returned untouched.
 *
 * @param {*} text - value to trim (expected to be a string when truthy).
 * @returns {*} the processed string, or `text` itself when it is falsy.
 */
function trimIfExist(text) {
  if (!text) {
    return text;
  }
  var trimmed = text.trim();
  return replaceByfalseifneeded(trimmed);
}
/**
 * Collapses every run of two or more whitespace characters, and every literal
 * backslash-n sequence, into a single space. Falsy values are returned as-is.
 *
 * @param {*} text - value to clean (expected to be a string when truthy).
 * @returns {*} the cleaned string, or `text` itself when it is falsy.
 */
function removeSpecCharacterIfExist(text) {
  if (!text) {
    return text;
  }
  return text.replace(/(\s\s+|\\n)/gi, ' ');
}
/**
 * Normalises a star-distribution object without mutating it.
 *
 * Works on a clone: every property whose value is `false`, `null` or '' is
 * set to "0%". When exactly five properties were empty, every property of the
 * clone is set to 'NA' instead.
 *
 * @param {Object} obj - distribution object to normalise.
 * @param {Object} _ - utility library; only `_.clone` is used.
 * @returns {Object} the normalised clone.
 */
function nullIfnothingOrNARepartitionStar(obj,_){
  var normalized = _.clone(obj);
  var names = [];
  var emptyCount = 0;
  var name;

  for (name in obj) {
    names.push(name);
  }

  names.forEach(function (key) {
    var value = obj[key];
    var isEmpty = (value === false || value === null || value === '');
    if (isEmpty) {
      normalized[key] = "0%";
      emptyCount += 1;
    }
  });

  // Five empty entries: flag the whole distribution as 'not available'.
  if (emptyCount === 5) {
    names.forEach(function (key) {
      normalized[key] = 'NA';
    });
  }
  return normalized;
}
/**
 * Collapses a duplicated, dash-separated currency marker: when the first two
 * dash-separated parts of `text` are identical, the first part is returned;
 * otherwise `text` is returned unchanged.
 *
 * @param {string} text - currency text.
 * @returns {string} the collapsed currency or the original text.
 */
function currencyFormater(text) {
  var parts = text.split('-');
  var startsWithDuplicate = parts.length > 1 && parts[0] === parts[1];
  if (startsWithDuplicate) {
    return parts[0];
  }
  return text;
}
//captcha alert
/**
 * Tells whether the page contains one of the known captcha phrases.
 *
 * Each phrase is looked up with a `div:contains("…")` selector, in order, and
 * the search stops at the first hit.
 *
 * @param {Function} $ - jQuery-style selector function for the page.
 * @returns {boolean} true when a phrase is found in some `div`, false otherwise.
 */
function checkCaptcha($){
  var captchaPhrases = [
    "make sure you're not a robot.",
    "ne suis pas un robot",
    "n'êtes pas un robot",
    "caractères que vous voyez",
    "the characters you see",
    "les caractères affichés"
  ];
  for (var i = 0; i < captchaPhrases.length; i++) {
    var selector = 'div:contains("' + captchaPhrases[i] + '")';
    if ($(selector).length !== 0) {
      return true;
    }
  }
  return false;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment