Scrapy Wikipedia URL spider
# items.py -- the item exported for each external link found
import scrapy


class LinkItem(scrapy.Item):
    href = scrapy.Field()
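
Since the spider only yields LinkItem objects, you would typically pair it with a feed export or a small item pipeline. Below is a minimal sketch of such a pipeline, assuming you want the collected hrefs written to a plain-text file; the class name, output filename, and pipelines.py location are illustrative, not part of the gist.

# pipelines.py (hypothetical) -- writes every collected href to a text file
class LinkWriterPipeline:
    def open_spider(self, spider):
        # Called once when the spider starts
        self.file = open('links.txt', 'w')

    def process_item(self, item, spider):
        # Called for every LinkItem the spider yields
        self.file.write(item['href'] + '\n')
        return item

    def close_spider(self, spider):
        # Called once when the spider finishes
        self.file.close()

To enable it, you would register the class in the project settings, e.g. ITEM_PIPELINES = {'myproject.pipelines.LinkWriterPipeline': 300} (the project name is hypothetical).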
"""Spider for the wikipedia website""" | |
from urllib.parse import urljoin | |
from scrapy import Request | |
from scrapy.linkextractors import LinkExtractor | |
from scrapy.spiders import CrawlSpider, Rule | |
from ..items import LinkItem | |
class WikipediaSpider(CrawlSpider): | |
name = 'wikipedia' | |
allowed_domains = ['en.wikipedia.org'] # Add other domains maybe ? | |
start_urls = [ | |
'https://en.wikipedia.org/wiki/Main_Page' | |
] | |
links = set() | |
def filter_duplicate_link(self, link): | |
if link not in self.links: | |
self.links.add(link) | |
return False | |
else: | |
return True | |
def parse(self, response): | |
for link in response.xpath('//a/@href').extract(): | |
# Link do not belong to wikipedia, yield a new Item | |
if 'http://' in link or 'https://' in link not in link: | |
if not self.filter_duplicate_link(link): | |
yield LinkItem( | |
href=link | |
) | |
# Link belong to wikipedia, follow (We don't want the "Special" links and do want the "wiki" links) | |
elif 'Special' not in link and 'wiki' in link: | |
yield Request( | |
urljoin(response.url, link) | |
) |
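
To try the spider outside of the usual scrapy crawl wikipedia command, you can drive it with Scrapy's CrawlerProcess. A minimal sketch, assuming the two files above live in a regular Scrapy project package (the import path is hypothetical):

from scrapy.crawler import CrawlerProcess

from myproject.spiders.wikipedia import WikipediaSpider  # hypothetical path

process = CrawlerProcess(settings={
    # Export the yielded LinkItem objects as JSON (FEEDS requires Scrapy 2.1+)
    'FEEDS': {'links.json': {'format': 'json'}},
    'CLOSESPIDER_PAGECOUNT': 10,  # stop after a few pages for a quick test
})
process.crawl(WikipediaSpider)
process.start()

The CLOSESPIDER_PAGECOUNT cap is just a safeguard for experimentation; without it, a crawl seeded from the Main Page would keep following internal links indefinitely.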