Created
April 3, 2021 16:33
-
-
Save ischurov/e68761c1034fed90145a5e39ccd45ebb to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import scrapy | |
# Run from the command line with:
#   scrapy runspider scrapper.py -O results.csv
class WikipediaSpider(scrapy.Spider):
    """Crawl Wikipedia article links starting from a single seed page.

    For every link found directly inside a paragraph (``p > a``) the spider
    emits an edge ``{'from': <page heading>, 'to': <link title>}`` and then
    follows the link, up to ``max_depth`` levels away from the start page.
    """

    name = 'wikispider'
    start_urls = ['https://ru.wikipedia.org/wiki/Премия_Кнута']

    # Depth is controlled manually here. Scrapy's built-in DEPTH_LIMIT
    # setting is a simpler alternative, e.g. from the command line:
    #   scrapy runspider scrapper.py --set=DEPTH_LIMIT=2 -O results.csv
    max_depth = 2

    def parse(self, response, depth=0):
        """Yield link edges for ``response`` and requests for linked pages.

        Args:
            response: The downloaded page.
            depth: Number of hops between the start page and ``response``;
                passed along to follow-up requests through ``cb_kwargs``.

        Yields:
            ``{'from': ..., 'to': ...}`` dicts, one per titled paragraph
            link, interleaved with follow-up requests for linked pages
            that are still within ``max_depth``.
        """
        if depth >= self.max_depth:
            return

        # The heading is the same for every link on the page: look it up once.
        heading = response.css("#firstHeading::text").get()
        if heading is None:
            # The heading text may be wrapped in a child element; fall back
            # to the concatenated descendant text in that case.
            nested = "".join(response.css("#firstHeading ::text").getall())
            heading = nested.strip() or None

        # Pages at ``max_depth`` would be dropped by the guard above, so
        # there is no point in downloading them at all.
        within_limit = depth + 1 < self.max_depth

        for link in response.css('p > a'):
            target = link.attrib.get('title')
            if target is None:
                # No title attribute: nothing to report as the edge target,
                # so skip the link instead of raising KeyError and losing
                # the remaining links on this page.
                continue

            yield {'from': heading, 'to': target}

            if not within_limit:
                continue

            # Links carrying the ``new`` class are not followed. Compare
            # individual class tokens so that ``class="new other"`` is
            # recognised too.
            if 'new' in link.attrib.get('class', '').split():
                continue

            yield response.follow(link,
                                  self.parse,
                                  cb_kwargs={"depth": depth + 1})
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment