Skip to content

Instantly share code, notes, and snippets.

@jfjensen
Last active July 25, 2023 18:33
Show Gist options
  • Save jfjensen/82b480711269065dd4a0ebdf8b665c4f to your computer and use it in GitHub Desktop.
Save jfjensen/82b480711269065dd4a0ebdf8b665c4f to your computer and use it in GitHub Desktop.
A Python script for scraping reviews from Amazon given one or more product ASIN codes.
from playwright.sync_api import sync_playwright
from selectolax.parser import HTMLParser
from dataclasses import dataclass
import time
import csv
from loguru import logger
@dataclass
class Item:
    """One scraped review together with the product it belongs to.

    Every field is stored as plain text, exactly as it was extracted from
    the review page.
    """

    # ASIN code of the product whose review page was scraped.
    asin: str
    # Text of the product link shown on the review page.
    product: str
    # Headline of the review.
    title: str
    # Rating text of the review (kept as a string, not converted to a number).
    rating: str
    # Review body with newlines removed.
    body: str
class Reviews(object):
    """Scrape Amazon product reviews with a Playwright-driven Chromium browser.

    The object starts Playwright and launches a browser in ``__init__`` and
    shuts both down in ``__exit__``, so it is meant to be used as a context
    manager::

        with Reviews() as rv:
            reviews = rv.run_single("B01N5IB20Q")
            rv.to_csv(reviews, "out.csv")
    """

    def __init__(self):
        # The parser class itself is stored; it is instantiated once per page.
        self.parser = HTMLParser
        self.pw = sync_playwright().start()
        try:
            self.browser = self.pw.chromium.launch()
        except Exception:
            # Do not leave the Playwright driver running when the browser
            # cannot be launched.
            self.pw.stop()
            raise
        logger.debug("Reviews object initialized")

    def __enter__(self):
        logger.debug("Reviews object entered")
        return self

    def __exit__(self, xc_type, exc_value, traceback):
        # Stop Playwright even if closing the browser fails. Returns None, so
        # an exception raised inside the ``with`` body is never suppressed.
        try:
            self.browser.close()
        finally:
            self.pw.stop()
        logger.debug("Reviews object exited")

    @staticmethod
    def _text_of(node):
        """Return the stripped text of ``node``, or "" when ``node`` is None."""
        if node is None:
            return ""
        return node.text(strip=True)

    def __get_html(self, page, asin, nr):
        """Load review page ``nr`` for ``asin`` and return its parsed HTML.

        The request uses the ``reviewerType=avp_only_reviews`` and
        ``sortBy=recent`` query parameters.

        Returns:
            The parsed document when it contains at least one
            ``div[data-hook=review]`` element, otherwise ``None``.
        """
        url = f"https://www.amazon.com/product-reviews/{asin}/ref=cm_cr_arp_d_viewopt_srt?reviewerType=avp_only_reviews&sortBy=recent&pageNumber={nr}"
        page.goto(url)
        html = self.parser(page.content())
        if html.css_matches("div[data-hook=review]"):
            logger.debug(f"Getting the HTML review page nr. {nr} for ASIN: {asin}")
            return html
        logger.debug(f"No more HTML review pages for ASIN: {asin}")
        return None

    def __parse_html(self, html, asin):
        """Turn one parsed review page into a list of ``Item`` objects.

        A piece of a review that cannot be located (product link, title,
        rating or body) is stored as an empty string instead of aborting the
        whole scrape with an ``AttributeError``.
        """
        logger.debug(f"Parsing an HTML review page for ASIN: {asin}")
        product = self._text_of(html.css_first("a[data-hook=product-link]"))
        items = []
        for review in html.css("div[data-hook=review]"):
            title_link = review.css_first("a[data-hook=review-title]")
            title_node = None
            if title_link is not None:
                title_node = title_link.css_first("span:nth-child(3)")
            rating_node = review.css_first("i[data-hook=review-star-rating] span")
            body_node = review.css_first("span[data-hook=review-body] span")
            items.append(
                Item(
                    asin=asin,
                    product=product,
                    title=self._text_of(title_node),
                    # Only the first three characters of the rating text are
                    # kept (presumably "4.0" out of "4.0 out of 5 stars").
                    rating=self._text_of(rating_node)[:3],
                    body=self._text_of(body_node).replace("\n", ""),
                )
            )
        return items

    def read_csv(self, file_path):
        """Read ASIN codes from the first column of a CSV file.

        Blank lines and rows whose first cell is empty are skipped, a leading
        UTF-8 byte-order mark is ignored, and surrounding whitespace is
        removed from every code.

        Returns:
            A list of ASIN strings in file order.
        """
        logger.debug(f"Reading ASINs from csv file: {file_path}")
        # newline="" lets the csv module do its own newline handling;
        # "utf-8-sig" drops a BOM that would otherwise stick to the first ASIN.
        with open(file_path, "r", newline="", encoding="utf-8-sig") as f:
            return [
                row[0].strip()
                for row in csv.reader(f)
                if row and row[0].strip()
            ]

    def to_csv(self, reviews, file_path):
        """Write reviews to ``file_path`` as CSV rows, without a header row.

        Each row holds asin, product, title, rating and body, in that order.
        An existing file is overwritten.
        """
        logger.debug(f"Writing reviews to csv file: {file_path}")
        with open(file_path, "w", newline='', encoding="utf-8") as f:
            csv.writer(f).writerows(
                (r.asin, r.product, r.title, r.rating, r.body) for r in reviews
            )

    def run_single(self, asin):
        """Collect every available review for one ASIN.

        Returns:
            A list of ``Item`` objects (empty when no review was found).
        """
        page = self.browser.new_page()
        try:
            logger.debug(f"Getting the all the available Amazon reviews for ASIN: {asin}")
            return self.__run(page, asin)
        finally:
            # Release the page even when scraping fails half-way.
            page.close()

    def run_multiple(self, asin_list):
        """Collect every available review for each ASIN in ``asin_list``.

        One browser page is reused for all products, with a five second
        pause between consecutive products.

        Returns:
            A single flat list with the ``Item`` objects of all products.
        """
        page = self.browser.new_page()
        all_products_reviews = []
        try:
            for position, asin in enumerate(asin_list):
                if position:
                    # Pause between products only; no delay is needed before
                    # the first one or after the last one.
                    time.sleep(5)
                logger.debug(f"Getting the all the available Amazon reviews for ASIN: {asin}")
                all_products_reviews.extend(self.__run(page, asin))
        finally:
            page.close()
        return all_products_reviews

    def __run(self, page, asin):
        """Walk the review pages of ``asin`` from page 1 until one has no reviews."""
        reviews_list = []
        page_nr = 1
        while True:
            html = self.__get_html(page, asin, nr=page_nr)
            if html is None:
                break
            reviews_list.extend(self.__parse_html(html, asin))
            page_nr += 1
        return reviews_list
def main():
    """Run the scraper twice: once for a fixed ASIN, once for a CSV of ASINs.

    The reviews of the single product are written to ``result_single.csv``;
    the reviews of all products listed in ``products.csv`` are written to
    ``result_multiple.csv``.
    """
    with Reviews() as scraper:
        # Single product.
        single_reviews = scraper.run_single("B01N5IB20Q")
        scraper.to_csv(single_reviews, "result_single.csv")

        # Several products, with the ASIN codes taken from a CSV file.
        asin_codes = scraper.read_csv("products.csv")
        combined_reviews = scraper.run_multiple(asin_codes)
        scraper.to_csv(combined_reviews, "result_multiple.csv")


# Only start scraping when the file is executed as a script.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment