Created
November 2, 2017 02:42
-
-
Save aflansburg/353d5e3c24315a6bfa9d77ec726267a9 to your computer and use it in GitHub Desktop.
Scrape an Amazon product review IFrame (returned from the Product Advertising API for a particular ASIN)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re

from selenium import webdriver
# Python 3.6.3
# hold on to your butts

# Location of the chromedriver binary handed to Selenium.
driver_path = '/usr/local/bin/chromedriver'

# Signed review-iframe URL for one ASIN.
# NOTE: the query string carries an `exp=` timestamp and a `sig=` signature, so
# this exact URL is presumably only valid for a limited time and has to be
# regenerated (product advertising api?) before re-running -- confirm.
url = 'https://www.amazon.com/reviews/iframe?akid=AKIAIOWAH2MM2J3QSNPA&alinkCode=xm2&asin=B006R7AW6M&atag=AssociateTag%3Dsomeutility-20&exp=2017-11-02T21%3A59%3A44Z&v=2&sig=ljBbKJxQiq%252F90us8lfn1uDQ7VmXr%252BDknLvJ49jIsaHU%253D'

# (star rating, CSS class of the histogram row holding that rating's percentage)
STAR_ROW_CLASSES = (
    (5, 'histoRowFive'),
    (4, 'histoRowFour'),
    (3, 'histoRowThree'),
    (2, 'histoRowTwo'),
    (1, 'histoRowOne'),
)

_TOTAL_RE = re.compile(r'\d[\d,]*')
_PERCENT_RE = re.compile(r'(\d+(?:\.\d+)?)\s*%')


def parse_total_reviews(text):
    """Return the review total found in *text* (e.g. ``'1,234 Reviews'``).

    Thousands separators are accepted. Raises ``ValueError`` when *text*
    contains no number at all.
    """
    match = _TOTAL_RE.search(str(text))
    if match is None:
        raise ValueError(f'no review total found in {text!r}')
    return int(match.group(0).replace(',', ''))


def parse_star_percent(text):
    """Return the percentage in a histogram row's text as a float.

    The row text looks like ``'5 star\\n85%'``; only the number directly in
    front of the ``%`` sign is used, so the leading star label is ignored.
    Raises ``ValueError`` when no percentage is present.
    """
    match = _PERCENT_RE.search(str(text))
    if match is None:
        raise ValueError(f'no percentage found in {text!r}')
    return float(match.group(1))


def star_count(total_reviews, percent):
    """Return the estimated number of reviews that make up *percent* of the total."""
    # Multiply before dividing: total * (percent / 100) compounds the binary
    # representation error of the fraction (0.29, 0.57, ...).
    return int(round(total_reviews * percent / 100))


def format_report(total_reviews, star_percents):
    """Build the report lines, lowest star rating first.

    *star_percents* maps a star rating (1-5) to its percentage of all
    reviews. The first line deliberately ends in a newline so that printing
    it leaves a blank line under the total.
    """
    lines = [f'Total reviews: {total_reviews}\n']
    for stars in sorted(star_percents):
        percent = star_percents[stars]
        count = star_count(total_reviews, percent)
        # Format the percentage as scraped; the old int(fraction * 100)
        # truncated float error and showed e.g. 28% for a 29% row.
        lines.append(
            f'Total number of {stars} star reviews: {count} ({percent:g}%)'
        )
    return lines


def scrape_review_histogram(browser):
    """Read the review total and per-star percentages from the loaded page.

    Returns ``(total_reviews, {stars: percent})``.

    NOTE: product name will have to be gleaned elsewhere (product advertising api?)
    NOTE(review): find_element_by_class_name is a Selenium 3-era call; newer
    Selenium releases use find_element(By.CLASS_NAME, ...) -- confirm the
    installed version before upgrading.
    """
    # the 'tiny' element contains the total number of reviews
    total_reviews = parse_total_reviews(
        browser.find_element_by_class_name('tiny').text
    )
    star_percents = {
        stars: parse_star_percent(
            browser.find_element_by_class_name(row_class).text
        )
        for stars, row_class in STAR_ROW_CLASSES
    }
    return total_reviews, star_percents


def main():
    """Open the review iframe in Chrome, print the star breakdown, clean up."""
    # Bind the name before the try block so the cleanup below can tell whether
    # the driver was ever started.
    browser = None
    try:
        browser = webdriver.Chrome(executable_path=driver_path)
        browser.get(url)
        total_reviews, star_percents = scrape_review_histogram(browser)
        for line in format_report(total_reviews, star_percents):
            print(line)
    except Exception as exc:
        # Top-level boundary of the script: report instead of crashing, but
        # say what actually went wrong.
        print('Something died....')
        print(f'{type(exc).__name__}: {exc}')
    finally:
        if browser is not None:
            # quit() ends the whole driver session; close() would only close
            # the current window.
            browser.quit()


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
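Requires ChromeDriver and the Selenium Python package; the script expects the chromedriver binary at /usr/local/bin/chromedriver.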