Skip to content

Instantly share code, notes, and snippets.

@harryandriyan
Created April 27, 2020 08:26
Show Gist options
  • Save harryandriyan/66bbb9f9e7a16bafec5b7c085d1ddc32 to your computer and use it in GitHub Desktop.
Save harryandriyan/66bbb9f9e7a16bafec5b7c085d1ddc32 to your computer and use it in GitHub Desktop.
Scrape Amazon Reviews
const puppeteer = require('puppeteer');
const cheerio = require('cheerio');
/**
 * Walk through the review pages of an Amazon product and collect every review.
 *
 * Opens a headless browser, reads the total review count from the first page
 * to work out how many pages to visit, then extracts each page with
 * `extractedEvaluateCall` and follows the "next page" link in between.
 *
 * @param {object} [options]
 * @param {string} [options.productId='B07T86F7FQ'] - Product id placed in the
 *   `/product-reviews/<id>` URL.
 * @param {number} [options.pageDelayMs=2000] - Pause, in milliseconds, before
 *   each page is read (gives the page time to settle after navigation).
 * @returns {Promise<object[]>} The reviews from all pages, in page order.
 * @throws {Error} If no review count can be parsed from the first page.
 */
const scrape = async ({ productId = 'B07T86F7FQ', pageDelayMs = 2000 } = {}) => {
  // The page count is derived by dividing the total by 10 — presumably the
  // number of reviews Amazon shows per page; confirm if the layout changes.
  const reviewsPerPage = 10;
  const nextPageSelector = '#cm_cr-pagination_bar > ul > li.a-last > a';

  const browser = await puppeteer.launch({ headless: true });
  try {
    const page = await browser.newPage();
    await page.goto(`https://www.amazon.com/product-reviews/${encodeURIComponent(productId)}`);
    await page.waitForSelector('body');

    const html = await page.evaluate(() => document.body.innerHTML);
    const $ = cheerio.load(html);

    // The fourth space-separated token of the filter summary is taken as the
    // total count. Strip non-digits so values such as "1,234" still parse.
    const countToken = $('#filter-info-section > span.a-size-base').text().split(' ')[3];
    const countDigits = (countToken || '').replace(/\D/g, '');
    if (countDigits === '') {
      throw new Error(`Could not parse the total review count for product ${productId}`);
    }
    const lastPageNumber = Math.ceil(Number(countDigits) / reviewsPerPage);

    const results = [];
    for (let pageIndex = 0; pageIndex < lastPageNumber; pageIndex++) {
      // Plain timer instead of page.waitFor(), which newer Puppeteer releases dropped.
      await new Promise((resolve) => setTimeout(resolve, pageDelayMs));
      results.push(...(await extractedEvaluateCall(page)));
      // There is no "next" link to follow after the final page.
      if (pageIndex < lastPageNumber - 1) {
        await page.click(nextPageSelector);
      }
    }
    return results;
  } finally {
    // Always shut the browser down, even when a step above throws.
    await browser.close();
  }
};
/**
 * Extract the reviews shown on the page currently loaded in `page`.
 *
 * Every `.aok-relative` element is treated as a review container; the fields
 * are read from its first child's first child. Containers without that
 * nesting are skipped.
 *
 * @param {object} page - Puppeteer page to read from.
 * @returns {Promise<object[]>} One object per container with the keys
 *   `title`, `body_copy`, `score`, `date`, `author`, `number_of_comment`,
 *   `is_has_media`, `is_verified` and `is_child` (always `false`).
 */
async function extractedEvaluateCall(page) {
  // The callback below is executed inside the browser page, not in Node, so it
  // must be self-contained: Node-side bindings such as `cheerio` are not
  // available there. It therefore relies on DOM APIs only.
  return page.evaluate(() => {
    // Concatenated text of every match, mirroring what a jQuery-style
    // `.text()` call yields for a multi-element selection.
    const combinedText = (root, selector) =>
      Array.from(root.querySelectorAll(selector), (node) => node.textContent).join('');
    const hasMatch = (root, selector) => root.querySelectorAll(selector).length > 0;

    const data = [];
    for (const element of document.querySelectorAll('.aok-relative')) {
      const wrapper = element.children[0];
      const baseElement = wrapper ? wrapper.children[0] : undefined;
      if (!baseElement) {
        // Not shaped like a review container; nothing to read.
        continue;
      }

      data.push({
        title: combinedText(baseElement, '.review-title-content > span'),
        body_copy: combinedText(baseElement, '.review-text-content > span'),
        // First space-separated token of the rating label, e.g. "4.0" -> 4.
        score: Number(combinedText(baseElement, '.review-rating > span.a-icon-alt').split(' ')[0]),
        // Whatever follows " on " in the date line; undefined when absent.
        date: combinedText(baseElement, '.review-date').split(' on ')[1],
        author: combinedText(baseElement, '.a-profile-name'),
        number_of_comment: Number(combinedText(baseElement, '.review-comment-total')),
        is_has_media: hasMatch(baseElement, '.review-image-tile'),
        is_verified: hasMatch(baseElement, 'span.a-size-mini.a-color-state.a-text-bold'),
        is_child: false,
      });
    }
    return data;
  });
}
// Run the scraper and print the result, its length, and its first and last entries.
scrape()
  .then((value) => {
    console.log(value);
    console.log(`Collection length: ${value.length}`);
    console.log(value[0]);
    console.log(value[value.length - 1]);
  })
  .catch((error) => {
    // Report scraping failures instead of leaving the rejection unhandled,
    // and signal the failure through the process exit code.
    console.error(error);
    process.exitCode = 1;
  });
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment