Last active
June 4, 2024 09:23
-
-
Save guillim/38f85ee1412b594a231e99ef8a8eb405 to your computer and use it in GitHub Desktop.
#cdiscount #dgm
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
//------------START------------------------------------------------------------------------------------------------ | |
/**
 * Entry point of the remote page function: routes the current request to the
 * handler matching its `userData.label`.
 *
 * `site` and `typeDeCrawl` are overridden by the values found in
 * `context.customData` when those are present and truthy.
 *
 * @param {object} context - crawler context (reads `log`, `customData`, `request.userData.label`).
 * @param {Function} $ - jQuery handle, forwarded to the search/product handlers.
 * @param {string} site - marketplace name used when `customData.site` is not set.
 * @param {string} typeDeCrawl - crawl mode used when `customData.typeDeCrawl` is not set.
 * @returns {Promise<*>} whatever the selected handler resolves to; `undefined` for any other label.
 */
async function start(context, $, site, typeDeCrawl) {
  context.log.info('remote file => start');

  const custom = context.customData;
  const effectiveSite = custom && custom.site ? custom.site : site;
  const effectiveType = custom && custom.typeDeCrawl ? custom.typeDeCrawl : typeDeCrawl;

  const label = context.request.userData.label;
  if (label === 'home') {
    return await case_home(context, effectiveSite, effectiveType);
  }
  if (label === 'search') {
    return await case_search(context, $, effectiveSite, effectiveType);
  }
  if (label === 'product') {
    return await case_product(context, $, effectiveSite, effectiveType);
  }
  // Unknown labels are deliberately ignored.
  return undefined;
}
//------------HOME------------------------------------------------------------------------------------------------ | |
/**
 * Seeds the crawl from the comma-separated `context.customData.initialList`.
 *
 * - `simple` / `profond`: every entry is a keyword; one 'search' request is
 *   enqueued per keyword.
 * - `produit`: every entry is a product URL; one 'product' request is enqueued
 *   per URL.
 *
 * Blank entries (e.g. produced by a trailing comma) are skipped, and product
 * URLs are trimmed before being enqueued.
 *
 * @param {object} context - crawler context (reads `customData.initialList`).
 * @param {string} site - marketplace name attached to product requests.
 * @param {string} typeDeCrawl - 'simple', 'profond' or 'produit'.
 * @returns {Promise<undefined|{debugInfo: string}>} `undefined` once the requests
 *   are enqueued, or a debug object when the list is empty or the mode is unknown.
 */
async function case_home(context, site, typeDeCrawl) {
  const initialList = (context.customData && context.customData.initialList)
    ? context.customData.initialList
    : '';
  if (initialList === '') {
    return { debugInfo: 'case_home: initialList empty' };
  }

  // Drop blank entries: they would enqueue an empty search or an empty URL.
  const entries = initialList.split(',').filter(function (entry) {
    return entry.trim() !== '';
  });

  let pending;
  switch (typeDeCrawl) {
    case 'simple':
    case 'profond':
      pending = entries.map(function (keyword) {
        const encodedKeyword = encodeURI(keyword.trim().replace(/\s+/g, '+'));
        const searchUrl = 'https://www.cdiscount.com/search/10/' + encodedKeyword + '.html';
        // The keyword is passed on exactly as written in the list, so downstream
        // rows keep the same c01_keyword value as before.
        return enqueueLabel(context, 'search', searchUrl, { c01_keyword: keyword });
      });
      break;
    case 'produit':
      pending = entries.map(function (url) {
        return enqueueLabel(context, 'product', url.trim(), { p03_marketplaceName: site });
      });
      break;
    default:
      return { debugInfo: 'case_home: bug in typeDeCrawl' };
  }

  // Wait for the enqueue calls to settle before returning. This is harmless
  // when enqueueLabel returns nothing (awaiting `undefined` is a no-op).
  await Promise.all(pending);
  return undefined;
}
//------------SEARCH---------------------------------------------------------------------------------------------- | |
/**
 * Scrapes a search-result page.
 *
 * For every product row it builds a record (marketplace, result count, URL,
 * id, position, title, raw price, number of comments, boolean flags). Sponsored
 * and organic rows are ranked independently, each starting at 1. Only the
 * first 15 positions of each ranking are kept:
 * - in 'profond' mode the product URL is enqueued as a 'product' request;
 * - otherwise the record is added to the returned array.
 * "Headline" offers (`.skwOffer`) are always added to the returned array,
 * flagged with `c26_sponsoredBrand = true`.
 *
 * @param {object} context - crawler context (reads `request.userData.interceptRequestData`,
 *   calls `waitFor` and `log.info`).
 * @param {Function} $ - jQuery handle for the page.
 * @param {string} site - marketplace name stored in each record.
 * @param {string} typeDeCrawl - crawl mode; 'profond' enqueues products instead of returning rows.
 * @returns {Promise<object[]>} the collected records.
 */
async function case_search(context, $, site, typeDeCrawl) {
  const ROW_SELECTOR = '.lpMain .jsPrdBlocContainer form';
  const MAX_POSITION = 15;

  const baseData = context.request.userData.interceptRequestData;
  const result = [];
  const pendingEnqueues = [];

  // A jQuery object is always truthy, so the predicate has to test `.length`.
  // A timeout must not abort the scrape: a page may legitimately have no rows.
  try {
    await context.waitFor(function () { return $(ROW_SELECTOR).length > 0; }, { timeoutMillis: 10000 });
  } catch (err) {
    context.log.info('case_search: no product row found after waiting - ' + ((err && err.message) || err));
  }

  const productCountRaw = $('.c-heading__title > span').text();
  // NaN when the heading holds no digit.
  const numberOfResults = parseInt(productCountRaw.replace(/[^0-9]/g, ''), 10);

  let sponsoredCount = 0;
  let organicCount = 0;

  $(ROW_SELECTOR).each(function (index, element) {
    const $row = $(element);
    let obj = $.extend({}, baseData);
    obj.c02_marketplaceName = site;
    obj.c03_NumberofResults = numberOfResults;
    obj.c06_itemURL = $row.find('.prdtBILDetails a:eq(0)').attr('href');
    // attr() yields undefined when the row has no link; fall back to ''.
    obj.c04_asin = reg(obj.c06_itemURL || '');
    if (obj.c04_asin.length === 0) {
      // Not a product row. Plain `return` continues the loop
      // (`return false` would stop a jQuery .each()).
      return;
    }

    obj.c26_sponsoredBrand = false;
    if ($row.find('.c-sponsoredMentions').length > 0) {
      sponsoredCount++;
      obj.c23_sponsoredProduct = true;
      obj.c07_position = sponsoredCount;
    } else {
      // The counter only advances on real product rows, so it already is the
      // organic rank; nothing has to be subtracted for skipped rows.
      organicCount++;
      obj.c23_sponsoredProduct = false;
      obj.c07_position = organicCount;
    }

    obj.c05_itemTitle = $row.find('.prdtBILA').text().trim();
    obj.c14_priceRaw = $row.find('.prdtBILPrice .price:eq(0)').text().trim();
    const commentCountRaw = $row.find('.prdtBILStar').text().trim();
    obj.c08_numberofcomments = commentCountRaw ? tr(commentCountRaw.replace(/[^0-9]/g, '')) : 0;
    // Add the marketplace-specific boolean flags.
    obj = addBooleansCdiscount($, element, obj);

    if (obj.c07_position > MAX_POSITION) {
      return;
    }
    if (typeDeCrawl === 'profond') {
      // Deep crawl: follow the product page instead of reporting the row.
      pendingEnqueues.push(enqueueLabel(context, 'product', obj.c06_itemURL, { p03_marketplaceName: site }));
      return;
    }
    result.push($.extend({}, obj));
  });

  // Headline products.
  $('.skwOffer').each(function (index, element) {
    const $offer = $(element);
    let obj = $.extend({}, baseData);
    obj.c02_marketplaceName = site;
    obj.c23_sponsoredProduct = false;
    obj.c26_sponsoredBrand = true;
    obj.c06_itemURL = $offer.find('a[href]:eq(0)').attr('href');
    obj.c07_position = index + 1;
    obj.c04_asin = reg(obj.c06_itemURL || '');
    obj.c05_itemTitle = $offer.find('.skwOfferTitle').text().trim();
    const ratingRaw = $offer.find('.skwRateContent').text().trim();
    obj.c08_numberofcomments = ratingRaw ? tr(ratingRaw.replace(',', '').replace(/[\(\)]/g, '')) : 0;
    obj = addBooleansCdiscount($, element, obj);
    if (obj.c07_position <= MAX_POSITION) {
      result.push($.extend({}, obj));
    }
  });

  // Let the enqueue calls settle before handing the rows back.
  await Promise.all(pendingEnqueues);
  return result;
}
//------------PRODUCT----------------------------------------------------------------------------------- | |
/**
 * Scrapes a product page once an `h1` element is present.
 *
 * The page is polled until `$("h1")` matches or the timeout elapses. On
 * timeout the record is returned with only `debugInfo` added. Otherwise the
 * record is filled with image URL, brand, description, product code, title,
 * number of comments, seller, star rating, reviews and question/answer pairs.
 *
 * The record is `context.request.userData.interceptRequestData` itself when it
 * is set (it is mutated in place), or a fresh object.
 *
 * @param {object} context - crawler context (reads `request.url` and `request.userData`).
 * @param {Function} $ - jQuery handle for the page.
 * @param {string} site - unused here; kept for signature parity with the other handlers.
 * @param {string} typeDeCrawl - unused here; kept for signature parity with the other handlers.
 * @param {object} [options]
 * @param {number} [options.timeoutMillis=10000] - how long to wait for the `h1`.
 * @param {number} [options.pollMillis=1000] - delay between two checks.
 * @returns {Promise<object>} the populated record (never `undefined`).
 */
async function case_product(context, $, site, typeDeCrawl, { timeoutMillis = 10000, pollMillis = 1000 } = {}) {
  const obj = context.request.userData.interceptRequestData
    ? context.request.userData.interceptRequestData
    : {};

  // Cuts long strings and appends '...'; non-strings pass through unchanged.
  const truncate = function (value, maxLength) {
    return (typeof value === 'string') ? value.slice(0, maxLength) + '...' : value;
  };

  // Poll with a real awaited delay, so the caller gets the record produced by
  // the successful attempt rather than the result of the first check only.
  const startedAt = Date.now();
  while ($('h1').length === 0) {
    if (Date.now() - startedAt > timeoutMillis) {
      obj.debugInfo = 'case_product: timeout after ' + (timeoutMillis / 1000) +
        ' seconds - check h1#title ? or is captcha true or false:' + checkCaptcha($);
      return obj;
    }
    await new Promise(function (resolve) { setTimeout(resolve, pollMillis); });
  }

  obj.p16_ImageURL = $('.fpImg img:eq(0)').attr('src');
  obj.p10_sellerTechnicalBrand = $("#ProductSheetAccordion table tr:contains('Marque')")
    .text().replace(/\s/g, '').replace('Marque', '');
  obj.p08_description1 = truncate(tr($('.c-productHighlights p').text()), 1900);
  obj.p04_code = reg(context.request.url);
  obj.p05_itemTitle = tr($('h1').text());
  obj.p06_numberofcomments = Number(
    tr($(".c-stars-rating__label:contains('avis'):eq(0)").text().replace(/[^0-9]/g, ''))
  );
  // The second description is currently not extracted.
  obj.p09_description2 = undefined;

  // Seller: explicit seller link, then out-of-stock marker, then the
  // "Cdiscount à volonté" mention, then the raw "sold by" text.
  obj.p02_sellerOfficial = tr($('.c-sellerBy a').text());
  if (!obj.p02_sellerOfficial && $('.outOfStock').length > 0) {
    obj.p02_sellerOfficial = 'out of stock';
  }
  const cdiscountAVolonte = $('#fpSellBy').filter(function (index, element) {
    return /Cdiscount\sà\svolonté/.test($(element).text());
  }).text();
  if (!obj.p02_sellerOfficial && cdiscountAVolonte.length > 0) {
    obj.p02_sellerOfficial = 'Cdiscount à volonté';
  }
  if (!obj.p02_sellerOfficial) {
    obj.p02_sellerOfficial = tr($('#fpSellBy').text());
  }

  // Star rating; 'NA' (hence NaN after Number()) when no rating text is found.
  let star = 'NA';
  try {
    star = tr($('span.c-stars-rating__note:eq(0)').text().replace('/ 5', '').replace(',', '.'));
    star = star ? star : 'NA';
  } catch (e) {
    console.error(e);
  }
  obj.p07_star = Number(tr(star).replace(',', '.'));

  obj.p14_reviews = [];
  $('.infoCli').each(function (index, element) {
    const $review = $(element);
    const review = {};
    review.p05_title = tr($review.find('.title').text());
    review.p02_note = getStarFromClasses($review);
    review.p04_texte = truncate(tr($review.find('> p').text()), 2900);
    const helpful = tr($review.find(".jsYesRat:contains('Oui') span").text());
    // Expected form is "(12)"; anything else is reported as 'NA'.
    review.p01_helpfulReview = (helpful && helpful.match(/\([0-9]*\)/g))
      ? helpful.replace(/[\(\)]/g, '')
      : 'NA';
    review.p06_verified = $review.find('.achatCert').length > 0;
    obj.p14_reviews.push(review);
  });

  obj.p15_QuestionReponses = [];
  // Query the page itself: there is no meaningful `this` to search from here.
  $('.fpFAQQuestion').each(function (index, element) {
    const $question = $(element);
    const qa = {};
    qa.p01_question = tr($question.find('div:eq(0) .fpFAQQuestionText').text());
    qa.p02_reponse = tr($question.find('div:eq(0) .fpAnswerContent p:eq(0)').text());
    obj.p15_QuestionReponses.push(qa);
  });

  return obj;
}
//-----------------------------------function----------------------------------------------------- | |
/**
 * Enqueues a request tagged with a handler label.
 *
 * @param {object} context - crawler context providing `enqueueRequest`.
 * @param {string} label - handler label stored in `userData.label`.
 * @param {string} url - URL to enqueue.
 * @param {object} interceptRequestData - data carried along in `userData.interceptRequestData`.
 * @returns {*} whatever `context.enqueueRequest` returns, so that callers can
 *   await it when it is a promise (callers ignoring the value are unaffected).
 */
function enqueueLabel(context, label, url, interceptRequestData) {
  return context.enqueueRequest({
    userData: {
      label: label,
      interceptRequestData: interceptRequestData
    },
    url: url
  });
}
/**
 * Tidies scraped text: every run of two or more whitespace characters, and
 * every literal backslash-n sequence, becomes a single space; the result is
 * trimmed. Non-string values are returned untouched.
 *
 * @param {*} text - value to clean.
 * @returns {*} the cleaned string, or `text` itself when it is not a string.
 */
function tr(text) {
  if (typeof text !== 'string') {
    return text;
  }
  const collapsed = text.replace(/(\s\s+|\\n)/gi, ' ');
  return collapsed.trim();
}
//captcha alert | |
/**
 * Tells whether the page contains one of the known captcha sentences
 * (English or French) inside a `div`.
 *
 * @param {Function} $ - jQuery handle for the page.
 * @returns {boolean} true as soon as one sentence is found, false otherwise.
 */
function checkCaptcha($) {
  const captchaSentences = [
    "make sure you're not a robot.",
    'ne suis pas un robot',
    "n'êtes pas un robot",
    'caractères que vous voyez',
    'the characters you see',
    'les caractères affichés',
  ];
  // `some` stops at the first hit, like the original chain of `||`.
  return captchaSentences.some(function (sentence) {
    return $('div:contains("' + sentence + '")').length !== 0;
  });
}
/**
 * Adds the boolean flags read from a result row to the given record.
 *
 * Sets `c31_isFirstChoice` (a `.prdtBILLabel` element exists in the row) and
 * `c32_isCouponAvailable` (a `.prdtBILSpecial > div` element exists in the row).
 * The c28-c30 flags present in commented-out form in the original are not set.
 *
 * @param {Function} $ - jQuery handle for the page.
 * @param {*} thisObject - the row element to inspect.
 * @param {object} interceptRequestData - record to enrich (mutated in place).
 * @returns {object} the same `interceptRequestData` object.
 */
function addBooleansCdiscount($, thisObject, interceptRequestData) {
  const hasMatch = function (selector) {
    return Boolean($(thisObject).find(selector).length);
  };
  interceptRequestData.c31_isFirstChoice = hasMatch('.prdtBILLabel');
  interceptRequestData.c32_isCouponAvailable = hasMatch('.prdtBILSpecial > div');
  return interceptRequestData;
}
/**
 * Reads a 1-5 rating from the `stN<i>` class carried by the `.ratingPosition`
 * element found inside the given element.
 *
 * @param {object} jQueryElement - jQuery-wrapped element to search in.
 * @returns {number|false} the highest `i` in 1..5 whose `stN<i>` class is
 *   present, or `false` when none is.
 */
function getStarFromClasses(jQueryElement) {
  // Scanning downwards and stopping at the first hit gives the same answer as
  // scanning upwards and keeping the last hit.
  for (let level = 5; level >= 1; level--) {
    if (jQueryElement.find('.ratingPosition').hasClass('stN' + level)) {
      return level;
    }
  }
  return false;
}
/**
 * Extracts the product identifier from a URL: the last path segment ending in
 * "html" and/or the `idOffre=...` query parameter, concatenated in the order
 * they appear.
 *
 * @param {*} url - URL to inspect; anything that is not a string (for example
 *   the `undefined` returned by `attr('href')` on a missing link) yields ''.
 * @returns {string} the concatenated matches, or '' when nothing matches.
 */
function reg(url) {
  if (typeof url !== 'string') {
    return '';
  }
  // NOTE(review): the dot before "html" is unescaped, so it matches any
  // character; kept as is so that extracted ids do not change.
  const matches = url.match(/[^\/]*.html|\??idOffre=[^&#]*/g);
  return matches ? matches.join('') : '';
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment