Scraper for the "www.beymen.com/erkek-ayakkabi-bot-10097" page.
# -*- coding: utf-8 -*-
"""
pip install scrapy
pip install pandas

scrapy runspider scraper/scraper.py -s LOG_ENABLED=False
"""
import re

import pandas as pd
import scrapy

SUB_PAGE = "erkek-ayakkabi-bot-10097"
DOMAIN = "www.beymen.com"
URL = 'https://%s/%s/' % (DOMAIN, SUB_PAGE)

# Absolute URLs of product pages already scheduled, to avoid duplicate requests.
product_list = set()


class BeymenSpider(scrapy.Spider):
    name = DOMAIN
    allowed_domains = [DOMAIN]
    start_urls = [
        URL
    ]
    df = pd.DataFrame(columns=["product_id", "category_id"])

    def add_to_df(self, mapping):
        # DataFrame.append was removed in pandas 2.0; pd.concat is the
        # supported way to add a row.
        self.df = pd.concat([self.df, pd.DataFrame([mapping])],
                            ignore_index=True)

    def parse_item(self, response):
        # Product URLs end with "_<product_id>_<category_id>".
        splitted_url = response.url.split("_")
        category_id = splitted_url[-1]
        product_id = splitted_url[-2]
        product_id_mapping = {"product_id": product_id,
                              "category_id": category_id}
        self.add_to_df(product_id_mapping)

    # Run first.
    def parse(self, response):
        # Collect product links on the page.
        for href in response.xpath("//div[@class='item']/a/@href").extract():
            if href.startswith("/p_"):
                url = response.urljoin(href)
                # Deduplicate on the absolute URL; the original compared the
                # absolute response.url against relative hrefs, which never matched.
                if url in product_list:
                    continue
                product_list.add(url)
                yield scrapy.Request(url, callback=self.parse_item)

        # Traverse to the next page.
        PAGE_SELECTOR = ".page-link"
        for pages in response.css(PAGE_SELECTOR).extract():
            if "next" in pages:
                regex = re.search(" +data-page=\"(.*?)\"", pages)
                if regex is None:
                    continue
                next_page_number = regex.group(1)
                next_page = "?page=" + next_page_number
                yield scrapy.Request(
                    response.urljoin(next_page),
                    callback=self.parse
                )

    def closed(self, reason):
        self.df.to_csv('product_category_mapping.csv', index=False)
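The ID extraction in parse_item assumes product URLs end in "_<product_id>_<category_id>". A minimal sketch of that split; the URL and IDs below are hypothetical, for illustration only:

# Hypothetical product URL; real beymen.com paths may differ.
url = "https://www.beymen.com/p_example-boot_12345_10097"
parts = url.split("_")
print(parts[-2], parts[-1])  # 12345 10097 -> (product_id, category_id)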
Usage
Install dependencies:
pip install scrapy
pip install pandas
To run the program:
scrapy runspider scraper/scraper.py -s LOG_ENABLED=False
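When the spider closes, it writes the collected mapping to product_category_mapping.csv. A quick way to inspect the result after a run, assuming the file was written to the working directory:

import pandas as pd

df = pd.read_csv("product_category_mapping.csv")
print(df.head())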