Skip to content

Instantly share code, notes, and snippets.

@cpouldev
Created December 27, 2023 07:32
Show Gist options
  • Save cpouldev/3cb40696d9885ec21ac98a565beacbda to your computer and use it in GitHub Desktop.
Save cpouldev/3cb40696d9885ec21ac98a565beacbda to your computer and use it in GitHub Desktop.
Bazaar
# -*- coding: utf-8 -*-
import json
import re
from scrapy import Request
from src.scraper.helpers import format_price
from src.scraper.shops.spiders.base import SupermarketSpider
# URL template for the shop's scrolled product-list endpoint, whose response
# is parsed as JSON by the spider. The ``item_id``, ``item_path`` and ``page``
# placeholders are filled in with ``str.format``; the page size (96) and the
# sort key (BestSelling-Descending) are fixed in the query string.
BASE_URL = 'https://www.bazaar-online.gr/CachedServices/api/Product/GetScrolledProductList?lang=el&templateCode=productList&configCode=&itemId={item_id}&itemPath={item_path}&pageNumber={page}&pageSize=96&sortKey=BestSelling-Descending'
class BazaarSpider(SupermarketSpider):
    """Spider for the bazaar-online.gr product catalogue.

    Crawl flow:

    1. ``parse`` collects category links from the home page navigation.
    2. ``start_pagination`` reads the category identifiers embedded in the
       category page and requests page 1 of the product-list endpoint.
    3. ``parse_catalog`` stores every product of a page and requests the
       next page until the endpoint returns an empty page.
    """

    name = 'bazaar'
    allowed_domains = ['bazaar-online.gr']
    start_urls = ['https://www.bazaar-online.gr/']

    # Patterns for the two identifiers embedded in a category page's source
    # (``ItemId:'123'`` and ``ItemPathChecksum:'-456'``). Compiled once here
    # instead of on every callback invocation.
    _ITEM_ID_RE = re.compile(r"(?<=ItemId:')\d+(?=')")
    _ITEM_PATH_RE = re.compile(r"(?<=ItemPathChecksum:')-?\d+(?=')")

    def parse(self, response):
        """Yield one request per category link found in the navigation menu.

        Empty hrefs, hrefs that are not site-relative (do not start with
        ``/``) and the ``/prosfores`` page are skipped.
        """
        hrefs = (
            response
            .css('ul.nav-1 ul.nav-2 li.nav-2-item > a.title')
            .xpath('@href')
            .getall()
        )
        for href in hrefs:
            # NOTE(review): '/prosfores' is presumably the offers listing,
            # excluded to avoid scraping products twice -- confirm.
            if not href or not href.startswith('/') or href == '/prosfores':
                continue
            yield Request(
                url=f'https://www.bazaar-online.gr{href}',
                callback=self.start_pagination,
            )

    def start_pagination(self, response):
        """Request the first product-list page for a category page.

        The item id and item-path checksum are extracted from the page
        source. When either one is missing the category is skipped with a
        warning instead of failing with an ``IndexError``.
        """
        # Decode once; both patterns are searched in the same text.
        page_source = response.body.decode()
        id_match = self._ITEM_ID_RE.search(page_source)
        path_match = self._ITEM_PATH_RE.search(page_source)
        if id_match is None or path_match is None:
            # NOTE(review): relies on the base spider exposing Scrapy's
            # ``logger`` attribute -- confirm SupermarketSpider derives from
            # scrapy.Spider.
            self.logger.warning(
                'Category identifiers not found in %s; skipping', response.url
            )
            return

        # The patterns have no capture groups, so ``group()`` is the bare
        # number -- the same value ``re.findall(...)[0]`` would return.
        meta = dict(
            item_id=id_match.group(),
            item_path=path_match.group(),
            page=1,
        )
        yield Request(
            url=BASE_URL.format(**meta),
            callback=self.parse_catalog,
            meta=meta,
        )

    def parse_catalog(self, response):
        """Store the products of one result page, then request the next page.

        Yields the image item of every product followed by a request for the
        following page. Pagination stops when the response has no item
        groups or its first group is empty.
        """
        item_groups = json.loads(response.body)['data']['Items']
        if not item_groups or not item_groups[0]:
            return

        for item in item_groups[0]:
            prices = item['Prices']['Price']
            # NOTE(review): 'DeletedPrice' goes through format_price while
            # 'CurrentPriceN' is stored as-is -- confirm this asymmetry is
            # intended.
            price = format_price(prices['DeletedPrice'])
            sale_price = prices['CurrentPriceN']

            # Image URLs are protocol-relative; the query string is dropped.
            image_url = 'https:' + item['Images'][0]['List'].split('?')[0]
            image_item, image_hash = self.get_image_item(image_url)
            yield image_item

            self.insert_item(
                item=item['Title'],
                key=str(item['Id']),
                url=item['CanonicalUrl'],
                price=price,
                sale_price=sale_price,
                image=image_hash,
            )

        # Build a fresh meta dict instead of reusing ``response.meta``: that
        # dict also carries Scrapy's per-request bookkeeping (retry_times,
        # redirect_times, download_slot, ...), which must not leak into the
        # request for the next page.
        next_meta = dict(
            item_id=response.meta['item_id'],
            item_path=response.meta['item_path'],
            page=response.meta['page'] + 1,
        )
        yield Request(
            url=BASE_URL.format(**next_meta),
            meta=next_meta,
            callback=self.parse_catalog,
        )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment