Created
December 27, 2023 07:34
-
-
Save cpouldev/e0628908771071f715388abf361f1193 to your computer and use it in GitHub Desktop.
marketin
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
from random import randint | |
from scrapy import Request | |
from w3lib import html | |
from src.scraper.shops.spiders.base import SupermarketSpider | |
def format_price(p):
    """Convert a scraped price string into a float.

    Surrounding whitespace, inner spaces and the euro sign are removed, and
    a decimal comma is replaced by a decimal point before conversion.

    Args:
        p: Raw price text such as ``'1,50 €'``. May be ``None`` when the
            selector that produced it matched nothing.

    Returns:
        The price as a ``float``, or ``None`` when ``p`` is not a string or
        the cleaned text is not a parseable number.
    """
    try:
        cleaned = p.strip().replace(' ', '').replace(',', '.').replace('€', '')
        return float(cleaned)
    except (AttributeError, TypeError, ValueError):
        # AttributeError/TypeError: p is None or not a str.
        # ValueError: the remaining text is not a number (e.g. empty string).
        # Only these are caught so that KeyboardInterrupt/SystemExit are no
        # longer swallowed the way a bare ``except`` would.
        return None
class MarketInSpider(SupermarketSpider):
    """Spider that walks the market-in.gr category listings and records products.

    Flow: landing page -> top-level category links -> paginated catalog
    pages. ``get_image_item`` and ``insert_item`` are not defined in this
    class; they are presumably provided by ``SupermarketSpider``.
    """

    name = 'market-in'
    allowed_domains = ['market-in.gr']

    @staticmethod
    def _forwarded_headers():
        """Return headers with a random ``X-Forwarded-For`` of the form 46.246.<128-255>.<0-255>."""
        return {'X-Forwarded-For': f'46.246.{randint(128, 255)}.{randint(0, 255)}'}

    def start_requests(self):
        """Request the ``/el-gr/`` landing page and hand it to ``get_cats``."""
        yield Request(
            url='https://www.market-in.gr/el-gr/',
            callback=self.get_cats,
            headers=self._forwarded_headers(),
        )

    def get_cats(self, response):
        """Yield one catalog request per link found in the category menu."""
        links = response.css('nav.category-menu > div > ul > li > a').xpath('@href').getall()
        for link in links:
            yield Request(
                # urljoin leaves absolute hrefs untouched and resolves
                # relative ones, which Request would otherwise reject.
                url=response.urljoin(link),
                callback=self.parse_catalog,
                headers=self._forwarded_headers(),
            )

    def parse_catalog(self, response):
        """Parse one catalog page: record its products, then follow pagination.

        For every ``.product-item`` with a parseable price this yields the
        image item returned by ``get_image_item`` and calls ``insert_item``
        with the product fields. Products with neither a regular nor a sale
        price are skipped. When the last pagination entry contains a
        ``material-icons`` element, its link is followed as the next page.
        """
        last_page_entry = response.css('.pagination ul li:last-child')
        # .get() returns None when the page has no pagination at all; fall
        # back to '' so the membership test cannot raise TypeError.
        last_page_markup = last_page_entry.get() or ''
        next_page = None
        if 'material-icons' in last_page_markup:
            next_page = last_page_entry.css('a').xpath('@href').get()

        for item in response.css('.product-item'):
            title = item.css('.product-title a:last-child::text').get()
            url = item.css('a.product-thumb').xpath('@href').get()

            cost_per_unit = None
            price_wrappers = item.css('.product-price')
            if len(price_wrappers) == 2:
                # With two price wrappers the first one is read as the
                # per-unit cost and the second as the product price.
                unit_markup = price_wrappers[0].css('span.new-price').get()
                if unit_markup is not None:
                    # Strip tags, then drop embedded newlines (the original
                    # replaced the literal '/n', which never matched them).
                    cost_per_unit = html.remove_tags(unit_markup).strip().replace('\n', '')
                price = price_wrappers[1].css('span.old-price::text').get()
                sale_price = price_wrappers[1].css('span.new-price::text').get()
            else:
                price = item.css('span.old-price::text').get()
                sale_price = item.css('span.new-price::text').get()

            price = format_price(price)
            sale_price = format_price(sale_price)
            if price and not sale_price:
                # No discount shown: the sale price equals the regular price.
                sale_price = price
            if not price and not sale_price:
                continue

            image_url = item.css('a.product-thumb img').xpath('@src').get()
            # NOTE(review): assumes @src is a site-relative path; a missing or
            # absolute src would produce a malformed URL here — confirm.
            image_url = f'https://market-in.gr{image_url}'
            image_item, image_hash = self.get_image_item(image_url)
            yield image_item

            self.insert_item(
                key=url,
                item=title,
                url=url,
                price=price,
                sale_price=sale_price,
                image=image_hash,
                cost_per_unit=cost_per_unit,
            )

        if next_page:
            yield Request(
                url=response.urljoin(next_page),
                callback=self.parse_catalog,
                # Same header as the other requests, for consistency.
                headers=self._forwarded_headers(),
            )
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment