Last active
February 3, 2023 16:16
-
-
Save guillim/cf0de926708746f3d08177cdfeca4b0a to your computer and use it in GitHub Desktop.
Boulanger #dgm #boulanger
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
//------------START------------------------------------------------------------------------------------------------ | |
/**
 * Entry point of the page function: dispatches the current request to the
 * handler that matches its label.
 *
 * The `site` and `typeDeCrawl` arguments act as defaults: when
 * `context.customData` carries a truthy value under the same name, that value
 * is used instead. Requests whose label is not 'home', 'search' or 'product'
 * are ignored.
 *
 * @param {Object} context - crawler context; `customData` and `request.label` are read.
 * @param {Function} $ - selector function, forwarded to the search/product handlers.
 * @param {Object} _ - utility library, forwarded to the search/product handlers.
 * @param {*} site - default site value.
 * @param {*} typeDeCrawl - default crawl type.
 */
function start(context, $, _, site, typeDeCrawl) {
  var custom = context.customData;
  var effectiveSite = (custom && custom.site) ? custom.site : site;
  var effectiveType = (custom && custom.typeDeCrawl) ? custom.typeDeCrawl : typeDeCrawl;

  switch (context.request.label) {
    case 'home':
      case_home(context, effectiveSite, effectiveType);
      break;
    case 'search':
      case_search(context, $, _, effectiveSite, effectiveType);
      break;
    case 'product':
      case_product(context, $, _, effectiveSite, effectiveType);
      break;
  }
}
//------------HOME------------------------------------------------------------------------------------------------ | |
/**
 * Handles the 'home' request: seeds the queue from a comma-separated list.
 *
 * The list is `context.customData.initialList` when that is truthy, otherwise
 * `context.customData` itself.
 *   - typeDeCrawl 'profond' or 'simple': each entry is a search keyword.
 *   - typeDeCrawl 'produit': each entry is a product URL.
 *   - anything else: the page finishes with a debugInfo record.
 * Entries are trimmed and blank entries are skipped. When the list is not a
 * string the page finishes with a debugInfo record instead of throwing.
 *
 * @param {Object} context - crawler context (customData, skipOutput, finish).
 * @param {*} site - unused here; kept for signature compatibility.
 * @param {string} typeDeCrawl - crawl mode.
 */
function case_home(context, site, typeDeCrawl) {
  var custom = context.customData;
  var initialList = (custom && custom.initialList) ? custom.initialList : custom;
  var enqueueEntry;

  if (typeDeCrawl === 'profond' || typeDeCrawl === 'simple') {
    enqueueEntry = enqueueSearch;
  } else if (typeDeCrawl === 'produit') {
    enqueueEntry = enqueueProductUrl;
  } else {
    context.finish({ debugInfo: 'case_home: bug in typeDeCrawl' });
    return;
  }

  if (typeof initialList !== 'string') {
    context.finish({ debugInfo: 'case_home: initialList is not a comma-separated string' });
    return;
  }

  context.skipOutput();
  initialList.split(',').forEach(function (entry) {
    var value = entry.trim();
    if (value) {
      enqueueEntry(context, value);
    }
  });
  context.finish();
}
//------------SEARCH---------------------------------------------------------------------------------------------- | |
/**
 * Handles a search-results page.
 *
 * Polls (first after 500 ms, then every 2 s) until at least one `div.product`
 * tile exists, then builds one record per tile from a clone of
 * `context.request.interceptRequestData`. After 10 s without tiles the page
 * finishes once with a debugInfo record and polling stops.
 *
 * Only tiles that have a link and a position <= 15 are kept:
 *   - typeDeCrawl 'profond': each kept tile is enqueued as a 'product' page and
 *     the output of this page is skipped;
 *   - any other mode: kept tiles are returned through `context.finish`.
 *
 * @param {Object} context - crawler context (request, finish, skipOutput).
 * @param {Function} $ - jQuery-like selector function for the loaded page.
 * @param {Object} _ - utility library; only `_.clone` is used.
 * @param {*} site - unused here; kept for signature compatibility.
 * @param {string} typeDeCrawl - crawl mode.
 */
function case_search(context, $, _, site, typeDeCrawl) {
  var MAX_POSITION = 15;
  var TIMEOUT_MS = 10000;
  var RETRY_DELAY_MS = 2000;
  var startedAt = Date.now();
  var requestData = context.request.interceptRequestData;

  // Converts a displayed price such as "1.299€99" or "499,00 €" into a number;
  // returns 'NA' when the text holds no usable number.
  function toPrice(text) {
    if (typeof text !== 'string') { return 'NA'; }
    var normalized = text.replace(/\./g, '').replace(/,/g, '.').replace(/€/g, '.');
    if (!normalized) { return 'NA'; }
    var value = parseFloat(normalized.replace(/[^0-9.]*/g, ''));
    return isNaN(value) ? 'NA' : value;
  }

  // Strips digits, separators and whitespace, leaving the currency symbol(s).
  function extractCurrency(text) {
    return text.replace(/[0-9., /\s]*/g, '');
  }

  var poll = function () {
    if (Date.now() - startedAt > TIMEOUT_MS) {
      var timeoutData = requestData || {};
      timeoutData.debugInfo = 'case_searchpagination: timeout after 10 seconds - is captcha true or false: ' + checkCaptcha($);
      context.finish(timeoutData);
      return; // stop polling so the page is finished exactly once
    }

    var rows = $("div.product");
    if (!(rows.length > 0)) {
      setTimeout(poll, RETRY_DELAY_MS);
      return;
    }

    // The result counter is the same for every tile, so it is read once.
    var productCountRaw = $(".infoListe span").text();
    var numberOfResults;
    if (productCountRaw) {
      var countMatch = productCountRaw.match(/[0-9, /\s]*article/g);
      var countDigits = countMatch ? countMatch[0].replace(/[^0-9]/g, '') : '';
      numberOfResults = countDigits ? parseInt(countDigits, 10) : 'NA';
    }

    var results = [];
    var position = 0;

    rows.each(function () {
      var row = $(this);
      var data = _.clone(requestData || {});

      data.c02_marketplaceName = 'boulanger.com';
      if (productCountRaw) {
        data.c03_NumberofResults = numberOfResults;
      }
      data.c24_pageNumber = 1;

      var link = row.find('h2 a');
      var href = link.attr('href');
      // Only build the URL when the tile really has a link; otherwise the
      // record is filtered out below instead of carrying a bogus URL.
      if (href) {
        data.c06_itemURL = 'https://www.boulanger.com' + href;
      }
      position++;
      data.c07_position = position;
      data.c04_asin = href ? href.replace(/[^0-9.]*/g, '') : 'NA';
      data.c05_itemTitle = removeSpecCharacterIfExist(link.text().trim());
      data.c09_seller = 'Boulanger';

      var struckText = removeSpecCharacterIfExist(row.find('.priceBarre:eq(0)').text()) || '';
      var currentText = (removeSpecCharacterIfExist(row.find('.fix-price .exponent:eq(0)').text()) || '') +
        (removeSpecCharacterIfExist(row.find('.fix-price sup:eq(0)').text()) || '');
      data.c14_priceRaw = struckText;
      data.c13_priceLow = toPrice(currentText);
      data.c12_priceHigh = toPrice(struckText);
      // The struck-through price is absent on non-discounted tiles; fall back
      // to the current price text so the currency is still detected.
      var currency = extractCurrency(struckText);
      if (!currency) {
        currency = extractCurrency(currentText);
      }
      data.c15_currency = currencyFormater(currency);

      var commentsText = row.find(".rating > span:contains('avis'):eq(0)").text().trim();
      data.c08_numberofcomments = commentsText ? commentsText.replace(",", "").replace(/[^0-9]/g, '') : 0;

      var ratingClass = row.find(".rating").attr('class');
      data.c10_star = ratingClass;
      if (ratingClass) {
        var starMatch = ratingClass.match(/star_[0-9]*/g);
        var starDigits = (starMatch && starMatch[0]) ? starMatch[0].replace(/[^0-9]/g, '') : false;
        data.c10_star = starDigits ? parseInt(starDigits, 10) / 10 : 'NA';
      }

      var isEligible = Boolean(data.c06_itemURL) && data.c07_position <= MAX_POSITION;
      if (typeDeCrawl === 'profond') {
        // Deep crawl: visit the product page of every eligible tile.
        if (isEligible) {
          enqueueLabelUniqueKey(context, 'product', data.c06_itemURL, data);
          context.skipOutput();
        } else {
          console.log('typeDeCrawl === profond -------- c06_itemURL est pas defini -------- ou c07_position est superieur a ' + MAX_POSITION + ' voir pas defini');
        }
      } else {
        if (isEligible) {
          results.push(_.clone(data));
        } else {
          console.log('typeDeCrawl !== profond-------- c06_itemURL est pas defini -------- ou c07_position est superieur a ' + MAX_POSITION + ' voir pas defini');
        }
      }
    });

    context.finish(results);
  };

  setTimeout(poll, 500);
}
//------------PRODUCT----------------------------------------------------------------------------------- | |
/**
 * Handles a product page.
 *
 * Polls (immediately, then every 2 s) until an `h1` exists, then completes the
 * record received in `context.request.interceptRequestData` (or a new object)
 * and finishes the page with it. Fields already holding a truthy value are
 * kept; only missing/empty fields are filled from the page. After 10 s without
 * an `h1` the page finishes once with a debugInfo message and polling stops.
 *
 * Rating-distribution and review extraction existed only as commented-out
 * code in the original and are not collected.
 *
 * @param {Object} context - crawler context (request, finish).
 * @param {Function} $ - jQuery-like selector function for the loaded page.
 * @param {Object} _ - utility library; unused here.
 * @param {*} site - unused here; kept for signature compatibility.
 * @param {*} typeDeCrawl - unused here; kept for signature compatibility.
 */
function case_product(context, $, _, site, typeDeCrawl) {
  var TIMEOUT_MS = 10000;
  var RETRY_DELAY_MS = 2000;
  var data = context.request.interceptRequestData ? context.request.interceptRequestData : {};
  var startedAt = Date.now();

  // Writes `value` into `data[field]` only when the field is missing or empty.
  function fillIfMissing(field, value) {
    if (!data[field]) {
      data[field] = value;
    }
  }

  // Converts a displayed price such as "1.299€99" or "499,00 €" into a number;
  // returns 'NA' when the text holds no usable number.
  function toPrice(text) {
    if (typeof text !== 'string') { return 'NA'; }
    var normalized = text.replace(/\./g, '').replace(/,/g, '.').replace(/€/g, '.');
    if (!normalized) { return 'NA'; }
    var value = parseFloat(normalized.replace(/[^0-9.]*/g, ''));
    return isNaN(value) ? 'NA' : value;
  }

  var poll = function () {
    if (Date.now() - startedAt > TIMEOUT_MS) {
      data.debugInfo = 'case_product: timeout after 10 seconds - check imageCount ? is captcha true or false:' + checkCaptcha($);
      context.finish(data);
      return; // stop polling so the page is finished exactly once
    }

    if (!$("h1").length) {
      setTimeout(poll, RETRY_DELAY_MS);
      return;
    }

    // Images: prefer the primary gallery count, else the alternative thumbnails.
    var classicImageCount = $('div[type=image]').length;
    var alternativeImageCount = $("#imageBlockThumbs img").length;
    data.c18_nombreDImages = classicImageCount || alternativeImageCount || 0;
    data.c27_ImageURL = $("#pp_picture img:eq(0)").attr('src');

    data.c16_descriptionProduit1 = removeSpecCharacterIfExist(trimIfExist($("h2:eq(0)").text()));

    var productId = $("span[itemprop=productID]").text();
    fillIfMissing('c04_asin', removeSpecCharacterIfExist(trimIfExist(productId ? productId : 'NA')));
    fillIfMissing('c05_itemTitle', removeSpecCharacterIfExist(trimIfExist($("h1").text())));

    var reviewText = $('.ratingandRef span.link').text();
    var reviewCount = reviewText ? reviewText.replace(/[^0-9]/g, '') : '';
    fillIfMissing('c08_numberofcomments', removeSpecCharacterIfExist(trimIfExist(reviewCount)));

    var highlights = removeSpecCharacterIfExist(trimIfExist($(".best-points").text()));
    // Cap the second description at 1900 characters.
    data.c17_descriptionProduit2 = (typeof highlights === 'string') ? highlights.substring(0, 1900) : highlights;

    fillIfMissing('c09_seller', 'Boulanger');

    // Rating: read NN from the "star_NN" class, subtract 2 and divide by 10.
    // NOTE(review): the offset of 2 is carried over from the original code;
    // confirm it against the site's markup.
    var star = 0;
    var ratingClass = $(".rating.orange").attr('class');
    var starMatch = ratingClass ? ratingClass.match(/star_[0-9]*/g) : null;
    if (starMatch) {
      var starValue = parseFloat(starMatch[0].replace('star_', '')) - 2;
      if (starValue > 0) {
        star = starValue / 10;
      }
    }
    fillIfMissing('c10_star', star);

    var struckText = removeSpecCharacterIfExist($('.priceBarre:eq(0)').text()) || '';
    var currentText = (removeSpecCharacterIfExist($('.fix-price .exponent:eq(0)').text()) || '') +
      (removeSpecCharacterIfExist($('.fix-price sup:eq(0)').text()) || '');
    var priceRaw = removeSpecCharacterIfExist(currentText) || '';
    var currency = priceRaw.replace(/[0-9., /\s]*/g, '');

    fillIfMissing('c12_priceHigh', toPrice(struckText));
    fillIfMissing('c13_priceLow', toPrice(currentText));
    fillIfMissing('c14_priceRaw', priceRaw);
    // Like the other fields, a currency already present on the record is kept;
    // otherwise use the formatted value, then the raw one as a last resort.
    fillIfMissing('c15_currency', currencyFormater(currency));
    fillIfMissing('c15_currency', currency);

    context.finish(data);
  };

  poll();
}
//------------------------------------------------------------------------------------------------ | |
//-----------------------------------function----------------------------------------------------- | |
//------------------------------------------------------------------------------------------------ | |
/**
 * Enqueues the search-results page for one keyword.
 *
 * The keyword is trimmed, split on whitespace and each word is percent-encoded
 * with encodeURIComponent before the words are joined with '+', so characters
 * such as '&', '=', '#' or '+' inside the keyword cannot alter the query
 * string. The untouched keyword is carried along as `c01_keyword`.
 *
 * @param {Object} context - crawler context; `enqueuePage` is called.
 * @param {string} keyword - search keyword.
 */
function enqueueSearch(context, keyword) {
  var searchUrl = 'https://www.boulanger.com/resultats?tr=';
  var encodedKeyword = keyword.trim().split(/\s+/).map(function (word) {
    return encodeURIComponent(word);
  }).join('+');

  context.enqueuePage({
    label: 'search',
    url: searchUrl + encodedKeyword,
    interceptRequestData: { c01_keyword: keyword }
  });
}
/**
 * Enqueues a product page given directly by URL.
 *
 * The record passed along starts with the URL (`c06_itemURL`) and the
 * marketplace name (`c02_marketplaceName`).
 *
 * @param {Object} context - crawler context; `enqueuePage` is called.
 * @param {string} url - product page URL.
 */
function enqueueProductUrl(context, url) {
  var seedRecord = {
    c06_itemURL: url,
    c02_marketplaceName: 'boulanger.com'
  };

  context.enqueuePage({
    label: 'product',
    url: url,
    interceptRequestData: seedRecord
  });
}
/**
 * Enqueues a page under an arbitrary label, forwarding the given record.
 *
 * @param {Object} context - crawler context; `enqueuePage` is called.
 * @param {string} label - label the page will be dispatched on.
 * @param {string} url - page URL.
 * @param {Object} interceptRequestData - record attached to the request as-is.
 */
function enqueueLabel(context, label, url, interceptRequestData) {
  var pageRequest = {
    label: label,
    url: url,
    interceptRequestData: interceptRequestData
  };
  context.enqueuePage(pageRequest);
}
/**
 * Enqueues a page with an explicit uniqueKey made of the URL, the record's
 * item URL and a random integer suffix, so the same URL can be enqueued more
 * than once.
 *
 * The item URL is read from `c06_itemURL`, the field the rest of this file
 * writes; a missing record or field contributes an empty string rather than
 * the text "undefined".
 *
 * @param {Object} context - crawler context; `enqueuePage` is called.
 * @param {string} label - label the page will be dispatched on.
 * @param {string} url - page URL.
 * @param {Object} interceptRequestData - record attached to the request as-is.
 */
function enqueueLabelUniqueKey(context, label, url, interceptRequestData) {
  var itemUrl = (interceptRequestData && interceptRequestData.c06_itemURL) ? interceptRequestData.c06_itemURL : '';
  var randomSuffix = Math.floor(Math.random() * 1000000000000000);

  context.enqueuePage({
    label: label,
    url: url,
    uniqueKey: url + itemUrl + randomSuffix,
    interceptRequestData: interceptRequestData
  });
}
/**
 * Maps the empty string to `false`; every other value is returned unchanged.
 *
 * The original condition (`text && text === ''`) could never be true because
 * an empty string is falsy, which made the function a no-op.
 *
 * @param {*} text - value to check.
 * @returns {*} `false` for '', otherwise `text`.
 */
function replaceByfalseifneeded(text) {
  return (text === '') ? false : text;
}
/**
 * Trims a non-empty string and passes the result through
 * `replaceByfalseifneeded`.
 *
 * Empty strings and non-string values (undefined, null, numbers, ...) are
 * returned unchanged instead of throwing on a missing `trim` method.
 *
 * @param {*} text - value to trim.
 * @returns {*} the processed string, or `text` itself when it is not a non-empty string.
 */
function trimIfExist(text) {
  if (typeof text !== 'string' || text === '') {
    return text;
  }
  return replaceByfalseifneeded(text.trim());
}
/**
 * Normalises whitespace in scraped text: every run of whitespace (including a
 * single newline or tab) and every literal backslash-n sequence becomes one
 * space.
 *
 * The original pattern only matched runs of two or more whitespace characters,
 * so a lone newline between two words was left in place. Non-string values are
 * returned unchanged.
 *
 * @param {*} text - value to clean.
 * @returns {*} the cleaned string, or `text` itself when it is not a string.
 */
function removeSpecCharacterIfExist(text) {
  if (typeof text !== 'string') {
    return text;
  }
  return text.replace(/(\s+|\\n)/g, ' ');
}
/**
 * Normalises a rating-distribution object.
 *
 * Works on a clone of `obj` (made with `_.clone`): every own property whose
 * value is `false`, `null` or '' becomes "0%". When every property was empty,
 * all of them become 'NA' instead, meaning the distribution is not available.
 * The original compared against a hard-coded 5; for a five-entry object the
 * result is identical.
 *
 * @param {Object} obj - distribution object; not modified.
 * @param {Object} _ - utility library providing `clone`.
 * @returns {Object} the normalised clone.
 */
function nullIfnothingOrNARepartitionStar(obj, _) {
  var normalized = _.clone(obj);
  var keys = Object.keys(obj);
  var emptyCount = 0;

  keys.forEach(function (key) {
    var value = obj[key];
    if (value === false || value === null || value === '') {
      normalized[key] = "0%";
      emptyCount++;
    }
  });

  // Nothing at all was found: report 'NA' rather than five times "0%".
  if (keys.length > 0 && emptyCount === keys.length) {
    keys.forEach(function (key) {
      normalized[key] = 'NA';
    });
  }
  return normalized;
}
/**
 * Collapses a duplicated currency such as "€-€" into "€".
 *
 * The text is split on '-'; when there are at least two parts and the first
 * two are equal, the first part is returned. Any other text is returned as
 * is, and so are non-string values (the original threw on `undefined`).
 *
 * @param {*} text - currency text.
 * @returns {*} the collapsed currency, or `text` unchanged.
 */
function currencyFormater(text) {
  if (typeof text !== 'string') {
    return text;
  }
  var parts = text.split('-');
  var isDuplicated = parts.length > 1 && parts[0] === parts[1];
  return isDuplicated ? parts[0] : text;
}
//captcha alert | |
/**
 * Tells whether the page looks like a captcha / robot-check page.
 *
 * Returns true as soon as one `div` contains any of the known English or
 * French captcha phrases; the phrases are checked in order and the search
 * stops at the first hit.
 *
 * @param {Function} $ - jQuery-like selector function for the loaded page.
 * @returns {boolean} true when a captcha phrase is found.
 */
function checkCaptcha($) {
  var captchaPhrases = [
    "make sure you're not a robot.",
    "ne suis pas un robot",
    "n'êtes pas un robot",
    "caractères que vous voyez",
    "the characters you see",
    "les caractères affichés"
  ];

  return captchaPhrases.some(function (phrase) {
    return $('div:contains("' + phrase + '")').length !== 0;
  });
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment