Created
September 26, 2018 13:54
-
-
Save markus2120/ba934c68166e0c06c5e0997de5a0d650 to your computer and use it in GitHub Desktop.
Adapted from https://gist.github.com/scrapehero
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests | |
from lxml import html | |
from lxml.etree import ParserError | |
import json | |
from time import sleep | |
import argparse | |
import unicodecsv as csv | |
import traceback | |
def _asin_from_url(url):
    '''Return the path segment that follows "/offer-listing/" in url, or None.'''
    _, marker, tail = url.partition('/offer-listing/')
    if not marker:
        return None
    # The ASIN ends at the next path separator or at the query string.
    candidate = tail.split('/', 1)[0].split('?', 1)[0]
    return candidate or None


def parse_offer_details(url):
    '''
    Download an amazon offer listing page and parse the seller details on it.
    eg:https://www.amazon.com/gp/offer-listing/

    The page is requested up to 5 times; a 403 response is treated as a
    captcha page and retried like any other failure.

    :param url: offer listing url
    :return: list of dicts with the keys 'price', 'condition', 'seller' and
        'asin' (one dict per offer); an empty list when the page contains no
        offers; None when the page body is empty or every attempt failed.
    '''
    max_attempts = 5
    request_timeout = 30  # seconds; without it a stalled connection blocks forever

    # Add some recent user agent to prevent blocking from amazon
    headers = {
        'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
    }
    base_url = "https://www.amazon.de/"

    # XPath expressions never change, so they are defined once up front.
    XPATH_PRODUCT_LISTINGS = "//div[contains(@class, 'a-row a-spacing-mini olpOffer')]"
    XPATH_PRODUCT_PRICE = ".//span[contains(@class, 'olpOfferPrice')]//text()"
    XPATH_PRODUCT_CONDITION = ".//span[contains(@class, 'olpCondition')]//text()"
    XPATH_PRODUCT_SELLER1 = ".//h3[contains(@class, 'olpSellerName')]//a/text()"
    XPATH_PRODUCT_SELLER2 = ".//h3[contains(@class, 'olpSellerName')]//img//@alt"

    # The ASIN is taken from the url so the function also works when it is
    # imported; the module-level `asin` set by the script entry point is only
    # a fallback for urls that do not follow the /offer-listing/<asin> layout.
    product_asin = _asin_from_url(url)
    if product_asin is None:
        product_asin = globals().get('asin')

    for attempt in range(max_attempts):
        try:
            print("Downloading and processing page :", url)
            response = requests.get(url, headers=headers, timeout=request_timeout)
            if response.status_code == 403:
                raise ValueError("Captcha found. Retrying")

            parser = html.fromstring(response.text)
            parser.make_links_absolute(base_url)

            # Parsing seller list
            listings = parser.xpath(XPATH_PRODUCT_LISTINGS)
            offer_list = []
            if not listings:
                print("no sellers found")
                return offer_list

            # parsing individual seller
            for listing in listings:
                raw_price = listing.xpath(XPATH_PRODUCT_PRICE)
                raw_condition = listing.xpath(XPATH_PRODUCT_CONDITION)
                seller1 = listing.xpath(XPATH_PRODUCT_SELLER1)
                seller2 = listing.xpath(XPATH_PRODUCT_SELLER2)

                # cleaning parsed data
                # An offer without a price node must not abort the whole page.
                product_price = raw_price[0].strip() if raw_price else None
                # All whitespace is removed from the condition text.
                product_condition = ''.join(''.join(raw_condition).split()) if raw_condition else None
                # Sellers shown as a logo carry their name in the image alt text.
                product_seller = ''.join(seller1).strip() if seller1 else ''.join(seller2).strip()

                offer_details = {
                    'price': product_price,
                    'condition': product_condition,
                    'seller': product_seller,
                    'asin': product_asin,
                }
                print(product_price, product_condition, product_seller, product_asin)
                offer_list.append(offer_details)
            return offer_list
        except ParserError:
            # lxml raises ParserError for an empty document; retrying is pointless.
            print("empty page found")
            break
        except Exception:
            # Exception (not a bare except) so Ctrl-C and SystemExit still propagate.
            print(traceback.format_exc())
            print("retying :", url)
            if attempt < max_attempts - 1:
                # Short, growing pause before the next attempt.
                sleep(attempt + 1)
    return None
if __name__ == '__main__':
    # Script entry point: one positional argument, the product's ASIN.
    cli = argparse.ArgumentParser()
    cli.add_argument('asin', help='unique product id, eg "B01DQ2B8UY"')
    cli_args = cli.parse_args()

    # `asin` stays a module-level name; parse_offer_details refers to it.
    asin = cli_args.asin

    # Offer listing page on amazon.de, filtered to used-condition offers.
    url = ''.join([
        'https://www.amazon.de/gp/offer-listing/',
        asin,
        '/ref=dp_olp_used?ie=UTF8&condition=used',
    ])
    data = parse_offer_details(url)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment