Scraper for the "www.beymen.com/erkek-ayakkabi-bot-10097" page.
# -*- coding: utf-8 -*-
"""
pip install scrapy
pip install pandas

scrapy runspider scraper/scraper.py -s LOG_ENABLED=False
"""
import re

import pandas as pd
import scrapy

SUB_PAGE = "erkek-ayakkabi-bot-10097"
DOMAIN = "www.beymen.com"
URL = 'https://%s/%s/' % (DOMAIN, SUB_PAGE)

# Absolute URLs of product pages already scheduled, to avoid duplicate requests.
product_list = set()


class BeymenSpider(scrapy.Spider):
    name = DOMAIN
    allowed_domains = [DOMAIN]
    start_urls = [
        URL
    ]
    df = pd.DataFrame(columns=["product_id", "category_id"])

    def add_to_df(self, mapping):
        # DataFrame.append was removed in pandas 2.0; pd.concat is the
        # supported way to add a row.
        self.df = pd.concat([self.df, pd.DataFrame([mapping])],
                            ignore_index=True)

    def parse_item(self, response):
        # Product URLs end with "_<product_id>_<category_id>".
        splitted_url = response.url.split("_")
        category_id = splitted_url[-1]
        product_id = splitted_url[-2]
        product_id_mapping = {"product_id": product_id,
                              "category_id": category_id}
        self.add_to_df(product_id_mapping)

    # Run first.
    def parse(self, response):
        # Collect product links on the page.
        for href in response.xpath("//div[@class='item']/a/@href").extract():
            if href.startswith("/p_"):
                url = response.urljoin(href)
                # Deduplicate on the absolute URL; the original compared the
                # absolute response.url against relative hrefs, which never matched.
                if url in product_list:
                    continue
                product_list.add(url)
                yield scrapy.Request(url, callback=self.parse_item)

        # Traverse to the next page.
        PAGE_SELECTOR = ".page-link"
        for pages in response.css(PAGE_SELECTOR).extract():
            if "next" in pages:
                regex = re.search(" +data-page=\"(.*?)\"", pages)
                if regex is None:
                    continue
                next_page_number = regex.group(1)
                next_page = "?page=" + next_page_number
                yield scrapy.Request(
                    response.urljoin(next_page),
                    callback=self.parse
                )

    def closed(self, reason):
        self.df.to_csv('product_category_mapping.csv', index=False)
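The ID extraction in parse_item assumes product URLs end in "_<product_id>_<category_id>". A minimal sketch of that split; the URL and IDs below are hypothetical, for illustration only:

# Hypothetical product URL; real beymen.com paths may differ.
url = "https://www.beymen.com/p_example-boot_12345_10097"
parts = url.split("_")
print(parts[-2], parts[-1])  # 12345 10097 -> (product_id, category_id)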
Usage
Install dependencies:
pip install scrapy
pip install pandas
To run the program:
scrapy runspider scraper/scraper.py -s LOG_ENABLED=False
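When the spider closes, it writes the collected mapping to product_category_mapping.csv. A quick way to inspect the result after a run, assuming the file was written to the working directory:

import pandas as pd

df = pd.read_csv("product_category_mapping.csv")
print(df.head())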