Last active
September 1, 2022 07:47
-
-
Save mpangrazzi/037a718f831dfb325bcb8d2d259c75b5 to your computer and use it in GitHub Desktop.
A very basic Scrapy spider for amazon.com
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re

from bs4 import BeautifulSoup
from scrapy import Spider
class AmazonSpider(Spider):
    """Scrape customer reviews from an amazon.com review-list page.

    The page to start from is supplied through the ``start_url`` spider
    argument (for example ``-a start_url=<reviews page URL>`` on the Scrapy
    command line). Every review found is yielded as a dict with the keys
    ``title``, ``rating`` and ``text``; the "next page" link is then followed
    with the same callback.
    """

    name = "amazon"
    allowed_domains = ["amazon.com"]
    custom_settings = {
        "LOG_LEVEL": "INFO",
        "DOWNLOAD_DELAY": 2,  # delay between requests (in seconds)
        "CLOSESPIDER_ITEMCOUNT": 50,  # number of items to scrape
        "COOKIES_ENABLED": True,
        "USER_AGENT": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36",
    }

    # Leading number of a rating label, e.g. the "4.0" in "4.0 out of 5 stars".
    # A comma is accepted as the decimal separator as well.
    _RATING_RE = re.compile(r"\s*(\d+(?:[.,]\d*)?)")

    def __init__(self, *args, **kwargs):
        """Initialise the spider and seed it with the ``start_url`` argument.

        Args:
            *args: Forwarded unchanged to ``Spider.__init__``.
            **kwargs: Spider arguments; ``start_url`` is the page to crawl first.

        Raises:
            ValueError: If ``start_url`` is missing or empty and the spider
                has no other start URLs to fall back on.
        """
        super(AmazonSpider, self).__init__(*args, **kwargs)
        start_url = kwargs.get("start_url")
        if start_url:
            self.start_urls = [start_url]
        elif not getattr(self, "start_urls", None):
            # Without this check the spider would be seeded with [None] and
            # fail later, far from the actual cause.
            raise ValueError(
                "AmazonSpider requires a 'start_url' argument "
                "(e.g. -a start_url=<reviews page URL>)"
            )

    def get_text(self, el):
        """Return the text content of a selector match, without markup.

        Args:
            el: A selector whose ``extract()`` returns an HTML fragment.

        Returns:
            The text nodes of the fragment, each stripped of surrounding
            whitespace and joined with single spaces.
        """
        soup = BeautifulSoup(el.extract(), "html.parser")
        return soup.get_text(separator=" ", strip=True)

    def parse_rating_string(self, text):
        """Convert a rating label such as ``"4.0 out of 5 stars"`` to an int.

        Only the number at the start of the label is used; its fractional
        part is truncated (``"4.7 out of 5 stars"`` gives ``4``).

        Args:
            text: The rating label text.

        Returns:
            The leading number of the label as an ``int``.

        Raises:
            ValueError: If the label does not start with a number.
        """
        match = self._RATING_RE.match(text)
        if match is None:
            raise ValueError("could not parse a rating from %r" % (text,))
        number = match.group(1).replace(",", ".")
        return int(float(number))

    def parse(self, response):
        """Yield the reviews on a page, then follow the "next page" link.

        Args:
            response: The downloaded review-list page.

        Yields:
            One dict per review with ``title``, ``rating`` and ``text`` keys,
            followed by a request for each ``li.a-last > a`` link, handled by
            this same method.
        """
        review_list = response.css("#cm_cr-review_list")
        titles = review_list.css(".review-title")
        ratings = review_list.css(".review-rating")
        texts = review_list.css(".review-text > span")

        if not (len(titles) == len(ratings) == len(texts)):
            # The three lists are matched up by position; when their lengths
            # differ some reviews cannot be paired and are left out.
            self.logger.warning(
                "Review field counts differ on %s (titles=%d, ratings=%d, texts=%d)",
                response.url,
                len(titles),
                len(ratings),
                len(texts),
            )

        # zip() stops at the shortest list, so a missing field can no longer
        # raise IndexError and abort pagination for the whole crawl.
        for title, rating, text in zip(titles, ratings, texts):
            yield {
                "title": self.get_text(title),
                "rating": self.parse_rating_string(self.get_text(rating)),
                "text": self.get_text(text),
            }

        for next_page in response.css("li.a-last > a"):
            yield response.follow(next_page, self.parse)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment