Skip to content

Instantly share code, notes, and snippets.

@paladin3895
Last active April 11, 2020 08:39
Show Gist options
  • Save paladin3895/d58cd4b34c099f65f3c8129e318b5b04 to your computer and use it in GitHub Desktop.
eyewiki.crawler.py
import scrapy
import re
class Crawler(scrapy.Spider):
    """Crawl eyewiki.org: Main Page -> category listings -> articles.

    Each article's rendered body is split into <h1> sections and then
    <h2> sub-sections, which are printed to stdout.
    """

    name = 'crawler'
    start_urls = ['https://eyewiki.org/Main_Page']
    base_url = 'https://eyewiki.org/'  # kept for backward compatibility; joins now use response.urljoin

    def parse(self, response):
        """Yield one request per category found in the Main Page category tree."""
        for category in response.css('.CategoryTreeSection'):
            categoryObject = {
                'name': category.css('a.CategoryTreeLabel ::text').extract_first(),
                'link': category.css('a.CategoryTreeLabel ::attr(href)').extract_first(),
            }
            # extract_first() may return None on malformed markup; skip rather
            # than raise TypeError on string concatenation.
            if not categoryObject['link']:
                continue
            # urljoin correctly handles root-relative hrefs (e.g. '/Category:X');
            # the old `base_url + link` produced double-slash URLs.
            yield scrapy.Request(response.urljoin(categoryObject['link']),
                                 self.parseCategory)

    def parseCategory(self, response):
        """Yield one request per article listed on a category page."""
        for article in response.css('.mw-category li'):
            articleObject = {
                'name': article.css('a::attr(title)').extract_first(),
                'link': article.css('a::attr(href)').extract_first(),
            }
            if not articleObject['link']:
                continue  # skip entries with no usable href
            yield scrapy.Request(response.urljoin(articleObject['link']),
                                 self.parseArticle)

    def parseArticle(self, response):
        """Split the article body into h1 sections / h2 sub-sections and print them."""
        content = response.css('#mw-content-text').extract_first()
        if content is None:
            # No article body on this page; nothing to split.
            return
        # Lookahead keeps each heading attached to the text that follows it.
        sections = re.split(r'(?=<h1>.+</h1>)', content)
        # sections[0] is the preamble before the first <h1>; skip it,
        # matching the original behavior.
        for section in sections[1:]:
            subSections = re.split(r'(?=<h2>.+</h2>)', section)
            for subSection in subSections[1:]:
                print('================')
                print(subSection)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment