Created
April 27, 2020 08:26
-
-
Save harryandriyan/66bbb9f9e7a16bafec5b7c085d1ddc32 to your computer and use it in GitHub Desktop.
Scrape Amazon Reviews
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
const puppeteer = require('puppeteer'); | |
const cheerio = require('cheerio'); | |
/**
 * Opens the Amazon product-review listing in headless Chromium, walks through
 * the paginated review pages and collects the reviews found on each one.
 *
 * @returns {Promise<Array<object>>} Every review object gathered by
 *   `extractedEvaluateCall`, in page order.
 */
const scrape = async () => {
  const browser = await puppeteer.launch({ headless: true });
  try {
    const page = await browser.newPage();
    await page.goto('https://www.amazon.com/product-reviews/B07T86F7FQ');
    await page.waitForSelector('body');

    const html = await page.evaluate(() => document.body.innerHTML);
    const $ = cheerio.load(html);

    // The fourth space-separated token of the filter summary is taken as the
    // review count. Thousands separators ("1,234") are stripped first because
    // Number() would otherwise yield NaN.
    const countToken = $('#filter-info-section > span.a-size-base').text().split(' ')[3];
    const totalReview = Number(String(countToken).replace(/,/g, ''));

    // Pagination assumes 10 reviews per page. When the count cannot be parsed,
    // fall back to scraping only the page that is already loaded.
    const lastPageNumber = Number.isFinite(totalReview) ? Math.ceil(totalReview / 10) : 1;

    let results = [];
    for (let index = 0; index < lastPageNumber; index++) {
      // Fixed pause so the listing (re)renders before it is read; a plain
      // timer works regardless of which Puppeteer wait helpers are available.
      await new Promise((resolve) => setTimeout(resolve, 2000));
      results = results.concat(await extractedEvaluateCall(page));
      if (index !== lastPageNumber - 1) {
        await page.click('#cm_cr-pagination_bar > ul > li.a-last > a');
      }
    }

    // Return the collected reviews: the caller reads `.length` and indexes
    // into the value, so the page count is not what it needs.
    return results;
  } finally {
    // Always release the browser process, even when a step above throws.
    await browser.close();
  }
};
/**
 * Extracts every review rendered on the page currently loaded in `page`.
 *
 * @param {object} page - Puppeteer page (anything exposing `evaluate(fn)`).
 * @returns {Promise<Array<object>>} One object per review with the keys
 *   `title`, `body_copy`, `score`, `date`, `author`, `number_of_comment`,
 *   `is_has_media`, `is_verified` and `is_child`.
 */
async function extractedEvaluateCall(page) {
  // The callback below is executed inside the browser page, not in Node, so it
  // may only rely on DOM APIs: Node-side bindings such as `cheerio` are not
  // defined there and referencing them throws a ReferenceError.
  return page.evaluate(() => {
    // Concatenated text of every match, mirroring cheerio's `.text()`.
    const textOf = (root, selector) =>
      Array.from(root.querySelectorAll(selector))
        .map((node) => node.textContent)
        .join('');
    const exists = (root, selector) => root.querySelectorAll(selector).length > 0;

    const data = [];
    for (const element of document.querySelectorAll('.aok-relative')) {
      // The review markup is read from the first grandchild; skip any
      // `.aok-relative` match that lacks that structure instead of throwing.
      const wrapper = element.children[0];
      const root = wrapper && wrapper.children[0];
      if (!root) {
        continue;
      }

      data.push({
        title: textOf(root, '.review-title-content > span'),
        body_copy: textOf(root, '.review-text-content > span'),
        // First space-separated token of the rating text, as a number.
        score: Number(textOf(root, '.review-rating > span.a-icon-alt').split(' ')[0]),
        // Everything after the first " on "; undefined when it is absent.
        date: textOf(root, '.review-date').split(' on ')[1],
        author: textOf(root, '.a-profile-name'),
        number_of_comment: Number(textOf(root, '.review-comment-total')),
        is_has_media: exists(root, '.review-image-tile'),
        is_verified: exists(root, 'span.a-size-mini.a-color-state.a-text-bold'),
        is_child: false,
      });
    }
    return data;
  });
}
// Entry point: run the scraper, then print the whole collection, its size and
// its first and last entries.
scrape()
  .then((value) => {
    console.log(value);
    console.log(`Collection length: ${value.length}`);
    console.log(value[0]);
    console.log(value[value.length - 1]);
  })
  .catch((error) => {
    // Report the failure and signal it through the exit code rather than
    // leaving the rejection unhandled.
    console.error(error);
    process.exitCode = 1;
  });
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment