Skip to content

Instantly share code, notes, and snippets.

@paladin3895
Last active April 11, 2020 08:39
Show Gist options
  • Save paladin3895/d58cd4b34c099f65f3c8129e318b5b04 to your computer and use it in GitHub Desktop.
eyewiki.crawler.py
import scrapy
import re
class Crawler(scrapy.Spider):
    """Crawl eyewiki.org: Main Page -> category listings -> articles.

    Each article's rendered body is split into <h1> sections and then
    <h2> sub-sections, which are printed to stdout.
    """

    name = 'crawler'
    start_urls = ['https://eyewiki.org/Main_Page']
    base_url = 'https://eyewiki.org/'  # kept for backward compatibility; joins now use response.urljoin

    def parse(self, response):
        """Yield one request per category found in the Main Page category tree."""
        for category in response.css('.CategoryTreeSection'):
            categoryObject = {
                'name': category.css('a.CategoryTreeLabel ::text').extract_first(),
                'link': category.css('a.CategoryTreeLabel ::attr(href)').extract_first(),
            }
            # extract_first() may return None on malformed markup; skip rather
            # than raise TypeError on string concatenation.
            if not categoryObject['link']:
                continue
            # urljoin correctly handles root-relative hrefs (e.g. '/Category:X');
            # the old `base_url + link` produced double-slash URLs.
            yield scrapy.Request(response.urljoin(categoryObject['link']),
                                 self.parseCategory)

    def parseCategory(self, response):
        """Yield one request per article listed on a category page."""
        for article in response.css('.mw-category li'):
            articleObject = {
                'name': article.css('a::attr(title)').extract_first(),
                'link': article.css('a::attr(href)').extract_first(),
            }
            if not articleObject['link']:
                continue  # skip entries with no usable href
            yield scrapy.Request(response.urljoin(articleObject['link']),
                                 self.parseArticle)

    def parseArticle(self, response):
        """Split the article body into h1 sections / h2 sub-sections and print them."""
        content = response.css('#mw-content-text').extract_first()
        if content is None:
            # No article body on this page; nothing to split.
            return
        # Lookahead keeps each heading attached to the text that follows it.
        sections = re.split(r'(?=<h1>.+</h1>)', content)
        # sections[0] is the preamble before the first <h1>; skip it,
        # matching the original behavior.
        for section in sections[1:]:
            subSections = re.split(r'(?=<h2>.+</h2>)', section)
            for subSection in subSections[1:]:
                print('================')
                print(subSection)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment