Skip to content

Instantly share code, notes, and snippets.

@cpouldev
Created December 27, 2023 07:37
Show Gist options
  • Save cpouldev/af53946576768326088b76e7d9750ca4 to your computer and use it in GitHub Desktop.
Save cpouldev/af53946576768326088b76e7d9750ca4 to your computer and use it in GitHub Desktop.
mymarket
# -*- coding: utf-8 -*-
import re
from scrapy import Request
from src.scraper.shops.spiders.base import SupermarketSpider
# Template for a category listing page. ``{cat}`` is the category path taken
# from a menu link's href and ``{page}`` is the page index (the first request
# uses 0). Listings are requested sorted by popularity, 120 items per page.
BASE_URL = (
    'https://eshop.mymarket.gr{cat}'
    '?sort_by=popularity&items_per_page=120&page={page}'
)
def format_price(p):
    """Convert a scraped price string such as ``'1,99 €'`` to a float.

    Surrounding whitespace, inner spaces and the euro sign are removed and a
    decimal comma is turned into a decimal point. When the text contains both
    dots and a comma and the comma comes last (e.g. ``'1.234,56'``), the dots
    are treated as thousands separators and dropped.

    Args:
        p: The raw price text, or ``None`` when the selector matched nothing.

    Returns:
        The price as a ``float``, or ``None`` if ``p`` is missing or cannot
        be parsed as a number.
    """
    if p is None:
        return None
    try:
        cleaned = p.strip().replace(' ', '').replace('€', '')
        # Only drop dots when a comma follows the last one; otherwise a plain
        # dot-decimal value such as '3.45' would be corrupted.
        if '.' in cleaned and cleaned.rfind(',') > cleaned.rfind('.'):
            cleaned = cleaned.replace('.', '')
        return float(cleaned.replace(',', '.'))
    except (AttributeError, TypeError, ValueError):
        # Non-string input or non-numeric text: report "no price" instead of
        # raising, but let unrelated exceptions (e.g. KeyboardInterrupt)
        # propagate.
        return None
class MyMarketSpider(SupermarketSpider):
    """Spider for the My Market e-shop (eshop.mymarket.gr).

    Starts at the shop's landing page, treats every top-level menu link as a
    category and walks that category's paginated listing. For each product
    card it yields the image item returned by ``get_image_item`` and records
    the product through ``insert_item``. Neither method is defined in this
    class; presumably both come from ``SupermarketSpider``.
    """

    name = 'mymarket'
    allowed_domains = ['mymarket.gr']
    start_urls = ['https://eshop.mymarket.gr/']

    # Scheme and host prepended to the site-relative hrefs found in the pages.
    _SITE_ROOT = 'https://eshop.mymarket.gr'

    def parse(self, response):
        """Schedule the first listing page of every category in the top menu."""
        cats = response.css('#horizontal-menu > li > a::attr(href)').getall()
        for cat in cats:
            yield Request(
                url=BASE_URL.format(cat=cat, page=0),
                callback=self.parse_catalog,
                meta={'category': cat},
            )

    def parse_catalog(self, response):
        """Extract the products of one listing page and follow pagination.

        Yields the image item of every usable product card and, when the page
        has a "next" pager link, a request for the following page of the same
        category. Cards without a title, a link or any parsable price are
        skipped. An empty page yields nothing.
        """
        items = response.css('article.product')
        next_page = response.css('.pager__item--next a::attr(href)').get()
        cat = response.meta['category']
        if not items:
            return

        for item in items:
            title = item.css('.product-title::text').get()
            href = item.css('.product-link::attr(href)').get()
            if title is None or href is None:
                # A card without title or link cannot be stored; skip it
                # rather than raising, which would abort the rest of the page
                # and the next-page request.
                continue
            title = title.strip()
            if item.css('.only-web-offer'):
                # Tag web-only offers in the title ("ΜΟΝΟ ONLINE" = online only).
                title = f'{title} (ΜΟΝΟ ONLINE)'

            price = format_price(item.css('.price.original-price::text').get())
            sale_price = format_price(item.css('.price.final-price::text').get())
            if price and not sale_price:
                sale_price = price
            if not price and not sale_price:
                continue

            offer = item.css('.product-ribbon-override-wrapper span::text').get()
            url = self._SITE_ROOT + href

            image_item, image_hash = self.get_image_item(self._image_url(item))
            yield image_item
            self.insert_item(
                key=url,
                image=image_hash,
                item=title,
                price=price,
                sale_price=sale_price,
                offer=offer,
                url=url,
                cost_per_unit=self._cost_per_unit(item),
            )

        if next_page:
            # Keep requesting 120 items per page whatever size the pager link
            # carries. The href is appended to the category path, so it is
            # presumably a bare query string - TODO confirm against the site.
            next_page = re.sub(r'(?<=items_per_page=)\d+', '120', next_page)
            yield Request(
                url=f'{self._SITE_ROOT}{cat}{next_page}',
                callback=self.parse_catalog,
                meta={'category': cat},
            )

    @staticmethod
    def _cost_per_unit(item):
        """Return ``'<value>/<unit>'`` for a product card, or None if absent."""
        fppu = item.css('.product-final-price-per-unit')
        if not fppu:
            return None
        value = fppu.css('.price-per-unit-value::text').get()
        unit = (
            fppu.css('.price-per-unit-unit .hide-tab::text').get()
            or fppu.css('.price-per-unit-unit::text').get()
        )
        if not value or not unit:
            # Avoid storing the literal text "None" for a missing part.
            return None
        return f'{value}/{unit}'

    @staticmethod
    def _image_url(item):
        """Return the card's image URL without its query string, or None."""
        img = item.css('.product-image img')
        image_url = img.xpath('@src').get()
        # A base64 ``src`` is an inline placeholder; the real location is then
        # read from ``data-lazyload-src``.
        if image_url is None or 'base64' in image_url:
            image_url = img.xpath('@data-lazyload-src').get()
        if not image_url or image_url.startswith('data:'):
            return None
        # Cut at the first '?' only, so a URL with no query string (or with
        # several '?') is kept instead of being discarded.
        return image_url.split('?', 1)[0]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment