Last active
November 16, 2022 03:43
-
-
Save scrapehero/900419a768c5fac9ebdef4cb246b25cb to your computer and use it in GitHub Desktop.
Python 3 code to extract Amazon reviews
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*- coding: utf-8 -*- | |
# Written as part of https://www.scrapehero.com/how-to-scrape-amazon-product-reviews-using-python/ | |
from lxml import html | |
from json import dump,loads | |
from requests import get | |
import json | |
from re import sub | |
from dateutil import parser as dateparser | |
from time import sleep | |
def ParseReviews(asin):
    """Scrape the Amazon.com product page for *asin* and return its review data.

    Retries the download up to 5 times. On success, returns a dict with keys
    'ratings' (star-histogram mapping), 'reviews' (list of per-review dicts),
    'url', 'name' and 'price'. Returns a dict with an 'error' key when the
    page is missing (404) or could not be fetched after all attempts.
    """
    # This script has only been tested with Amazon.com
    amazon_url = 'http://www.amazon.com/dp/' + asin
    # Add some recent user agent to prevent amazon from blocking the request
    # Find some chrome user agent strings here https://udger.com/resources/ua-list/browser-detail?browser=Chrome
    headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36'}

    # Amazon intermittently serves non-200 responses; retry a few times.
    for _ in range(5):
        # NOTE(review): verify=False disables TLS certificate checking.
        # Kept for backward compatibility (the URL is plain http anyway),
        # but consider removing it if the URL is switched to https.
        response = get(amazon_url, headers=headers, verify=False, timeout=30)
        if response.status_code == 404:
            return {"url": amazon_url, "error": "page not found"}
        if response.status_code != 200:
            continue

        # Removing the null bytes from the response; lxml rejects them.
        cleaned_response = response.text.replace('\x00', '')
        parser = html.fromstring(cleaned_response)

        # Two review layouts exist; the second is the fallback.
        XPATH_REVIEW_SECTION_1 = '//div[contains(@id,"reviews-summary")]'
        XPATH_REVIEW_SECTION_2 = '//div[@data-hook="review"]'
        XPATH_AGGREGATE_RATING = '//table[@id="histogramTable"]//tr'
        XPATH_PRODUCT_NAME = '//h1//span[@id="productTitle"]//text()'
        XPATH_PRODUCT_PRICE = '//span[@id="priceblock_ourprice"]/text()'

        raw_product_price = parser.xpath(XPATH_PRODUCT_PRICE)
        raw_product_name = parser.xpath(XPATH_PRODUCT_NAME)
        total_ratings = parser.xpath(XPATH_AGGREGATE_RATING)
        reviews = parser.xpath(XPATH_REVIEW_SECTION_1)

        product_price = ''.join(raw_product_price).replace(',', '')
        product_name = ''.join(raw_product_name).strip()

        if not reviews:
            reviews = parser.xpath(XPATH_REVIEW_SECTION_2)

        ratings_dict = {}
        reviews_list = []

        # Grabbing the rating histogram section in the product page.
        # Each row yields [star label, count/percentage] text nodes.
        for ratings in total_ratings:
            extracted_rating = ratings.xpath('./td//a//text()')
            if extracted_rating:
                rating_key = extracted_rating[0]
                rating_value = extracted_rating[1]
                if rating_key:
                    ratings_dict.update({rating_key: rating_value})

        # Parsing individual reviews
        for review in reviews:
            XPATH_RATING = './/i[@data-hook="review-star-rating"]//text()'
            XPATH_REVIEW_HEADER = './/a[@data-hook="review-title"]//text()'
            XPATH_REVIEW_POSTED_DATE = './/span[@data-hook="review-date"]//text()'
            XPATH_REVIEW_TEXT_1 = './/div[@data-hook="review-collapsed"]//text()'
            XPATH_REVIEW_TEXT_2 = './/div//span[@data-action="columnbalancing-showfullreview"]/@data-columnbalancing-showfullreview'
            XPATH_REVIEW_COMMENTS = './/span[@data-hook="review-comment"]//text()'
            XPATH_AUTHOR = './/span[contains(@class,"profile-name")]//text()'
            XPATH_REVIEW_TEXT_3 = './/div[contains(@id,"dpReviews")]/div/text()'

            raw_review_author = review.xpath(XPATH_AUTHOR)
            raw_review_rating = review.xpath(XPATH_RATING)
            raw_review_header = review.xpath(XPATH_REVIEW_HEADER)
            raw_review_posted_date = review.xpath(XPATH_REVIEW_POSTED_DATE)
            raw_review_text1 = review.xpath(XPATH_REVIEW_TEXT_1)
            raw_review_text2 = review.xpath(XPATH_REVIEW_TEXT_2)
            raw_review_text3 = review.xpath(XPATH_REVIEW_TEXT_3)

            # Cleaning data: collapse whitespace runs into single spaces.
            author = ' '.join(' '.join(raw_review_author).split())
            # "4.0 out of 5 stars" -> "4.0" (strip() drops the leftover space).
            review_rating = ''.join(raw_review_rating).replace('out of 5 stars', '').strip()
            review_header = ' '.join(' '.join(raw_review_header).split())

            try:
                review_posted_date = dateparser.parse(''.join(raw_review_posted_date)).strftime('%d %b %Y')
            except (ValueError, OverflowError, TypeError):
                # Unparseable or missing date text; record it as unknown.
                review_posted_date = None

            review_text = ' '.join(' '.join(raw_review_text1).split())

            # Grabbing hidden ("read more") review text if present; it is
            # embedded as a JSON attribute whose 'rest' field holds HTML.
            if raw_review_text2:
                json_loaded_review_data = loads(raw_review_text2[0])
                json_loaded_review_data_text = json_loaded_review_data['rest']
                # Strip HTML tags. The bare name `sub` is what the file
                # imports (`from re import sub`); `re.sub` was a NameError.
                cleaned_json_loaded_review_data_text = sub(r'<.*?>', '', json_loaded_review_data_text)
                full_review_text = review_text + cleaned_json_loaded_review_data_text
            else:
                full_review_text = review_text
            if not raw_review_text1:
                full_review_text = ' '.join(' '.join(raw_review_text3).split())

            raw_review_comments = review.xpath(XPATH_REVIEW_COMMENTS)
            review_comments = ''.join(raw_review_comments)
            # Keep only the digits of e.g. "3 people found this helpful".
            review_comments = sub('[A-Za-z]', '', review_comments).strip()

            review_dict = {
                'review_comment_count': review_comments,
                'review_text': full_review_text,
                'review_posted_date': review_posted_date,
                'review_header': review_header,
                'review_rating': review_rating,
                'review_author': author
            }
            reviews_list.append(review_dict)

        data = {
            'ratings': ratings_dict,
            'reviews': reviews_list,
            'url': amazon_url,
            'name': product_name,
            'price': product_price
        }
        return data

    return {"error": "failed to process the page", "url": amazon_url}
def ReadAsin():
    """Scrape reviews for each hard-coded ASIN and write them to data.json."""
    # Add your own ASINs here
    AsinList = ['B01ETPUQ6E', 'B017HW9DEW', 'B00U8KSIOM']
    extracted_data = []
    for asin in AsinList:
        print("Downloading and processing page http://www.amazon.com/dp/" + asin)
        extracted_data.append(ParseReviews(asin))
        # Pause between requests to reduce the chance of Amazon blocking us.
        sleep(5)
    # Context manager guarantees the file is closed even if dump() raises;
    # the original open()/close() pair leaked the handle on error.
    with open('data.json', 'w') as f:
        dump(extracted_data, f, indent=4)
# Run the scraper only when executed as a script, not on import.
if __name__ == '__main__':
    ReadAsin()
amritadey
commented
Jul 13, 2020
via email
Hi
Is this still working?
…On Mon, Jun 17, 2019 at 2:56 PM David Roldan ***@***.***> wrote:
Hi there,
The code has already been updated to get a response in json and csv format;
specifying its output.
Check the repository.
El dom., 16 de jun. de 2019 a la(s) 14:28, amritadey (
***@***.***) escribió:
> @DavidRoldan523 <https://github.com/DavidRoldan523> how do I get the
> review data using this code? Unable to save as csv file or read reviews
>
> —
> You are receiving this because you were mentioned.
> Reply to this email directly, view it on GitHub
> <
https://gist.github.com/900419a768c5fac9ebdef4cb246b25cb?email_source=notifications&email_token=AH7T6GF5GRKX4RSHH3ZIOD3P22H6BA5CNFSM4HYR3C62YY3PNVWWK3TUL52HS4DFVNDWS43UINXW23LFNZ2KUY3PNVWWK3TUL5UWJTQAFTYKQ#gistcomment-2945192
>,
> or mute the thread
> <
https://github.com/notifications/unsubscribe-auth/AH7T6GB4RKO6EG5BA7LS2OTP22H6BANCNFSM4HYR3C6Q
>
> .
>
—
You are receiving this because you commented.
Reply to this email directly, view it on GitHub
<https://gist.github.com/900419a768c5fac9ebdef4cb246b25cb?email_source=notifications&email_token=ALHQEGOHOKFOP6PZFL7TN33P273AXA5CNFSM4HYR3C62YY3PNVWWK3TUL52HS4DFVNDWS43UINXW23LFNZ2KUY3PNVWWK3TUL5UWJTQAFT2SC#gistcomment-2946337>,
or mute the thread
<https://github.com/notifications/unsubscribe-auth/ALHQEGONNK3BZGQ7YSCO7SDP273AXANCNFSM4HYR3C6Q>
.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment