Skip to content

Instantly share code, notes, and snippets.

@cpouldev
Created December 27, 2023 07:35
Show Gist options
  • Save cpouldev/7592587843c36a95a93b48b719b6a784 to your computer and use it in GitHub Desktop.
Save cpouldev/7592587843c36a95a93b48b719b6a784 to your computer and use it in GitHub Desktop.
Sklavenitis
# -*- coding: utf-8 -*-
from random import randint
from scrapy import Request
from w3lib import html
from src.scraper.helpers import format_price
from src.scraper.shops.spiders.base import SupermarketSpider
# URL template for the paginated product-list endpoint. ``{cat}`` is filled
# with a category path and ``{page}`` with the page number to fetch.
BASE_URL = (
    'https://www.sklavenitis.gr{cat}'
    '?$component=Atcom.Sites.Yoda.Components.ProductList.Index'
    '&sortby=ByPopularity'
    '&pg={page}'
    '&endless=true'
)
class SklavenitisSpider(SupermarketSpider):
    """Spider for the sklavenitis.gr product catalogue.

    The crawl starts at the category index page, requests the first
    product-list page of every category link found there, and then keeps
    requesting the next page of each category until a page comes back
    without any ``.product`` nodes.
    """

    name = 'sklavenitis'
    allowed_domains = ['sklavenitis.gr']
    start_urls = ['https://www.sklavenitis.gr/katigories/']

    @staticmethod
    def _forwarded_headers():
        """Return headers with a random ``X-Forwarded-For`` address.

        The address is drawn from 46.246.128.0 - 46.246.255.255.
        """
        # NOTE(review): presumably used to vary the apparent client address
        # between requests -- confirm the intent with the author.
        return {
            'X-Forwarded-For': f'46.246.{randint(128, 255)}.{randint(0, 255)}',
        }

    @staticmethod
    def _extract_cost_per_unit(item):
        """Return the per-unit price text of a product node, or ``None``.

        When the ``.priceKil`` node contains a ``.del`` element, only the
        ``.hightlight`` child is used; otherwise the whole node is used.
        Tags are stripped and newlines removed from the result.
        """
        price_node = item.css('.priceKil')
        if not price_node:
            return None
        if price_node.css('.del'):
            # NOTE(review): the 'hightlight' spelling is kept exactly as the
            # selector was written; presumably it mirrors the site's markup.
            markup = price_node.css('.hightlight').get()
        else:
            markup = price_node.get()
        if markup is None:
            # ``.get()`` returns None when nothing matched; remove_tags
            # cannot handle that.
            return None
        return html.remove_tags(markup).strip().replace('\n', '')

    def start_requests(self):
        """Yield the initial request for the category index page."""
        yield Request(url=self.start_urls[0],
                      headers=self._forwarded_headers(),
                      callback=self.parse_init)

    def parse_init(self, response):
        """Yield a first-page product-list request for every category link.

        The category path and its (stripped) link text travel in
        ``meta`` as ``cat_url`` and ``cat_name`` so later pages can be
        requested from :meth:`parse_catalog`.
        """
        for cat in response.css('aside nav.mainNav a'):
            url = cat.xpath('@href').get()
            if not url:
                # An anchor without an href would format "None" into the URL.
                continue
            name = cat.css('::text').get()
            if name:
                name = name.strip()
            yield Request(url=BASE_URL.format(cat=url, page=1),
                          callback=self.parse_catalog,
                          headers=self._forwarded_headers(),
                          meta={'page': 1,
                                'cat_url': url,
                                'cat_name': name})

    def parse_catalog(self, response):
        """Process one product-list page and request the following page.

        For every product with at least one price, the image item returned
        by ``self.get_image_item`` is yielded and the product is handed to
        ``self.insert_item``. Products missing a title or a link are
        skipped. An empty page ends pagination for the category.
        """
        items = response.css('.product')
        page = response.meta['page']
        if not items:
            return

        for item in items:
            title = item.css('h4.product__title a::text').get()
            if title is None:
                # Skip this product instead of failing the whole page.
                continue
            title = title.strip()

            cost_per_unit = self._extract_cost_per_unit(item)

            qty = item.css('.itemsEst::text').get()
            if qty:
                title = f'{title} ({qty})'

            offer = item.css('.gift_number::text').get()
            image_url = item.css('.product__figure img').xpath('@src').get()

            price = format_price(item.css('.main-price .deleted__price::text').get())
            sale_price = format_price(item.css('.main-price .price::text').get())
            if price and not sale_price:
                sale_price = price
            if not price and not sale_price:
                continue

            href = item.css('.absLink').xpath('@href').get()
            if not href:
                # The URL doubles as the item key, so a product without a
                # link cannot be stored.
                continue
            url = 'https://www.sklavenitis.gr' + href

            image_item, image_hash = self.get_image_item(image_url)
            yield image_item
            self.insert_item(
                item=title,
                key=url,
                price=price,
                sale_price=sale_price,
                offer=offer,
                url=url,
                image=image_hash,
                cost_per_unit=cost_per_unit,
            )

        # This method is a generator, so the follow-up request has to be
        # yielded; a ``return <value>`` here would be dropped.
        next_page = page + 1
        yield Request(url=BASE_URL.format(cat=response.meta['cat_url'], page=next_page),
                      callback=self.parse_catalog,
                      headers=self._forwarded_headers(),
                      meta={**response.meta, 'page': next_page})
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment