# Gist jfjensen/82b480711269065dd4a0ebdf8b665c4f (last active July 25, 2023 18:33).
# A Python script for scraping reviews from Amazon given one or more product ASIN codes.
# Note: GitHub flagged this file as containing hidden or bidirectional Unicode text
# that may be interpreted or compiled differently than what appears below. To review,
# open the file in an editor that reveals hidden Unicode characters.
import csv
import time
from dataclasses import dataclass

from loguru import logger
from playwright.sync_api import sync_playwright
from selectolax.parser import HTMLParser
@dataclass
class Item:
    """One scraped review, stored as plain strings.

    Attributes:
        asin: ASIN code of the product the review belongs to.
        product: Product name as shown on the review page.
        title: Review title.
        rating: Star rating text (kept as a string, not converted to a number).
        body: Review body text.
    """

    asin: str
    product: str
    title: str
    rating: str
    body: str
class Reviews:
    """Scrape Amazon product reviews for one or more ASIN codes.

    A Chromium browser is started through Playwright when the object is
    created and is shut down in ``__exit__``, so the class is meant to be
    used as a context manager::

        with Reviews() as rv:
            items = rv.run_single("B01N5IB20Q")
            rv.to_csv(items, "result.csv")
    """

    def __init__(self):
        """Start Playwright and launch the Chromium browser."""
        self.parser = HTMLParser
        self.pw = sync_playwright().start()
        try:
            self.browser = self.pw.chromium.launch()
        except Exception:
            # Do not leave the Playwright driver running when the browser
            # itself cannot be started.
            self.pw.stop()
            raise
        logger.debug("Reviews object initialized")

    def __enter__(self):
        """Return the scraper itself for use inside a ``with`` block."""
        logger.debug("Reviews object entered")
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the browser and stop Playwright; exceptions are not suppressed."""
        try:
            self.browser.close()
        finally:
            # Stop the driver even if closing the browser failed.
            self.pw.stop()
        logger.debug("Reviews object exited")

    @staticmethod
    def _first_text(node, selector):
        """Return the stripped text of the first match of *selector* under *node*.

        Returns an empty string when *node* is ``None`` or nothing matches, so
        a review that lacks one element does not abort the whole scrape.
        """
        if node is None:
            return ""
        match = node.css_first(selector)
        return "" if match is None else match.text(strip=True)

    def __get_html(self, page, asin, nr):
        """Load review page number *nr* for *asin* and return the parsed HTML.

        Returns ``None`` when the loaded page contains no
        ``div[data-hook=review]`` element, which the caller treats as the end
        of the available review pages.
        """
        # The query string asks for "avp_only_reviews" sorted by "recent".
        url = (
            f"https://www.amazon.com/product-reviews/{asin}/ref=cm_cr_arp_d_viewopt_srt"
            f"?reviewerType=avp_only_reviews&sortBy=recent&pageNumber={nr}"
        )
        page.goto(url)
        html = self.parser(page.content())
        if not html.css_matches("div[data-hook=review]"):
            logger.debug(f"No more HTML review pages for ASIN: {asin}")
            return None
        logger.debug(f"Getting the HTML review page nr. {nr} for ASIN: {asin}")
        return html

    def __parse_html(self, html, asin):
        """Extract one ``Item`` per review found on a parsed review page.

        Any element that is missing from the page yields an empty string in
        the corresponding field instead of raising.
        """
        logger.debug(f"Parsing an HTML review page for ASIN: {asin}")
        product = self._first_text(html, "a[data-hook=product-link]")
        items = []
        for review in html.css("div[data-hook=review]"):
            title = self._first_text(
                review.css_first("a[data-hook=review-title]"), "span:nth-child(3)"
            )
            # Only the first three characters of the rating label are kept
            # (presumably the "x.y" of an "x.y out of 5 stars" label - confirm).
            rating = self._first_text(review, "i[data-hook=review-star-rating] span")[:3]
            body = self._first_text(review, "span[data-hook=review-body] span").replace("\n", "")
            items.append(Item(asin=asin, product=product, title=title, rating=rating, body=body))
        return items

    def read_csv(self, file_path):
        """Read ASIN codes from the first column of a CSV file.

        Blank rows and rows whose first cell is empty are skipped, and
        surrounding whitespace is stripped. The file is decoded as UTF-8 (a
        leading BOM is tolerated), matching what ``to_csv`` writes.

        Returns:
            A list of ASIN strings in file order.
        """
        logger.debug(f"Reading ASINs from csv file: {file_path}")
        # newline="" is what the csv module expects for files it reads.
        with open(file_path, "r", newline="", encoding="utf-8-sig") as f:
            return [row[0].strip() for row in csv.reader(f) if row and row[0].strip()]

    def to_csv(self, reviews, file_path):
        """Write reviews to *file_path* as UTF-8 CSV, one row per review.

        Columns are asin, product, title, rating, body; no header row is
        written. An existing file is overwritten.
        """
        logger.debug(f"Writing reviews to csv file: {file_path}")
        with open(file_path, "w", newline='', encoding="utf-8") as f:
            writer = csv.writer(f)
            writer.writerows(
                (r.asin, r.product, r.title, r.rating, r.body) for r in reviews
            )

    def run_single(self, asin):
        """Scrape every available review page for one ASIN.

        Returns:
            A list of ``Item`` objects (empty when no reviews were found).
        """
        page = self.browser.new_page()
        try:
            logger.debug(f"Getting the all the available Amazon reviews for ASIN: {asin}")
            return self.__run(page, asin)
        finally:
            # Release the tab instead of keeping it open until the browser closes.
            page.close()

    def run_multiple(self, asin_list):
        """Scrape every available review page for each ASIN in *asin_list*.

        One browser page is reused for all products, with a 5 second pause
        between consecutive products.

        Returns:
            A single flat list of ``Item`` objects for all products.
        """
        page = self.browser.new_page()
        all_products_reviews = []
        try:
            for position, asin in enumerate(asin_list):
                if position:
                    # Pause between products only, not after the last one.
                    time.sleep(5)
                logger.debug(f"Getting the all the available Amazon reviews for ASIN: {asin}")
                all_products_reviews.extend(self.__run(page, asin))
        finally:
            page.close()
        return all_products_reviews

    def __run(self, page, asin):
        """Collect reviews page by page until a page without reviews is reached."""
        reviews_list = []
        nr = 1
        while True:
            html = self.__get_html(page, asin, nr=nr)
            if html is None:
                break
            reviews_list.extend(self.__parse_html(html, asin))
            nr += 1
        return reviews_list
def main():
    """Run the demo scrape for one hard-coded ASIN and for a CSV list of ASINs.

    The single-product result is written to ``result_single.csv``. ASINs are
    then read from ``products.csv`` and their combined reviews are written to
    ``result_multiple.csv``. Both scrapes share one ``Reviews`` instance, so
    everything happens inside the ``with`` block while the browser is open.
    """
    with Reviews() as rv:
        asin = "B01N5IB20Q"
        result = rv.run_single(asin)
        rv.to_csv(result, "result_single.csv")

        file_path = "products.csv"
        asins = rv.read_csv(file_path)
        result = rv.run_multiple(asins)
        rv.to_csv(result, "result_multiple.csv")
# Run the demo only when the file is executed as a script, not on import.
if __name__ == '__main__':
    main()
# (GitHub page footer: sign up for free, or sign in, to comment on this gist.)