Created
June 17, 2018 09:15
-
-
Save ahmedengu/fcb42c2abf04d8d636d9fc665e2ce691 to your computer and use it in GitHub Desktop.
Web scrape of Amazon seller reviews using Python
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import csv
import re
import sys
import time

import xlrd
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
# Scrape the review cards shown on one Amazon profile page and print, for each
# card: product name, ASIN, canonical product link, star-row text, numeric star
# rating, review comment, review date and the link to the review itself.

# Matches an Amazon product URL and captures the ASIN that follows "/dp/" or
# "/gp/product/". Compiled once here instead of on every loop iteration.
_ASIN_URL_RE = re.compile(
    r"http[s]?://www.amazon.(\w+)(.*)/(dp|gp/product)/(?P<asin>\w+).*",
    flags=re.IGNORECASE)


def _extract_asin(href):
    """Return the ASIN captured from an Amazon product URL.

    Args:
        href: The value of a link's ``href`` attribute; may be ``None`` or
            empty when the attribute is missing.

    Returns:
        The text of the ``asin`` group, or ``None`` when ``href`` is empty or
        does not match the product-URL pattern.
    """
    if not href:
        return None
    match = _ASIN_URL_RE.match(href)
    return match.group("asin") if match else None


chromeOptions = webdriver.ChromeOptions()
# Chrome content-setting value 2 means "block"; presumably used here so the
# page loads without downloading images.
prefs = {"profile.managed_default_content_settings.images": 2}
chromeOptions.add_experimental_option("prefs", prefs)
chromeOptions.add_argument("--headless")

amznLink1 = "https://www.amazon.com/gp/profile/amzn1.account.AECVUQ2HIJB47HBFZ2PM5GQ5IPVA"

driver = webdriver.Chrome(options=chromeOptions)
try:
    driver.maximize_window()
    driver.get(amznLink1)

    # One element per review card (alternative selector: //div [@data-story-id]).
    ProductBlock = driver.find_elements(
        By.XPATH,
        '//div [@class="desktop card profile-at-card undefined reviews-card"]')

    for p in ProductBlock:
        # Every XPath below starts with ".//" so the search is scoped to the
        # current card. A leading "//" searches the whole document and would
        # return the first card's data on every iteration.
        # NOTE: .encode() yields bytes on Python 3, so these values print as
        # b'...'; kept to preserve the original output format.
        ProductName = p.find_element(
            By.XPATH,
            './/div [@class="a-section profile-at-product-title-container profile-at-product-box-element"]/span'
        ).text.encode('utf-8', 'ignore')

        product_href = p.find_element(
            By.XPATH, './/a [@class="a-link-normal a-text-normal"]'
        ).get_attribute("href")
        asin = _extract_asin(product_href)
        # Build the canonical link only when an ASIN was actually found.
        if asin:
            ProductLink = "https://www.amazon.com/dp/" + asin
        else:
            ProductLink = ""

        Stars = p.find_element(
            By.XPATH, './/div [@class="a-row profile-at-product-review-stars"]'
        ).text.encode('utf-8', 'ignore')
        # First space-separated token of the icon's alt text.
        Star = p.find_element(
            By.XPATH, './/span [@class="a-icon-alt"]'
        ).get_attribute('innerHTML').split(' ')[0]

        Comment = p.find_element(
            By.XPATH,
            './/p [@class="a-spacing-small a-spacing-top-mini a-color-base"]'
        ).text.encode('utf-8', 'ignore')

        # The descriptor text is split on ' · ' and the second part is taken
        # as the date; fall back to an empty string when there is no separator.
        descriptor_parts = p.find_element(
            By.XPATH, './/span [@class="a-profile-descriptor"]'
        ).text.split(' · ')
        DateofReview = descriptor_parts[1] if len(descriptor_parts) > 1 else ""

        ReviewLink = p.find_element(
            By.XPATH,
            './/a [@class="a-link-normal profile-at-review-link a-text-normal"]'
        ).get_attribute("href")

        print(ProductName,
              asin,
              ProductLink,
              Stars,
              Star,
              Comment,
              DateofReview,
              ReviewLink
              )
finally:
    # Always shut the headless browser down so no Chrome process is leaked,
    # even when a lookup above raises.
    driver.quit()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment