Scrapy Wikipedia URL spider
# items.py -- the item exported for each external link found
import scrapy


class LinkItem(scrapy.Item):
    href = scrapy.Field()
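
Since the spider only yields LinkItem objects, you would typically pair it with a feed export or a small item pipeline. Below is a minimal sketch of such a pipeline, assuming you want the collected hrefs written to a plain-text file; the class name, output filename, and pipelines.py location are illustrative, not part of the gist.

# pipelines.py (hypothetical) -- writes every collected href to a text file
class LinkWriterPipeline:
    def open_spider(self, spider):
        # Called once when the spider starts
        self.file = open('links.txt', 'w')

    def process_item(self, item, spider):
        # Called for every LinkItem the spider yields
        self.file.write(item['href'] + '\n')
        return item

    def close_spider(self, spider):
        # Called once when the spider finishes
        self.file.close()

To enable it, you would register the class in the project settings, e.g. ITEM_PIPELINES = {'myproject.pipelines.LinkWriterPipeline': 300} (the project name is hypothetical).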
"""Spider for the wikipedia website""" | |
from urllib.parse import urljoin | |
from scrapy import Request | |
from scrapy.linkextractors import LinkExtractor | |
from scrapy.spiders import CrawlSpider, Rule | |
from ..items import LinkItem | |
class WikipediaSpider(CrawlSpider): | |
name = 'wikipedia' | |
allowed_domains = ['en.wikipedia.org'] # Add other domains maybe ? | |
start_urls = [ | |
'https://en.wikipedia.org/wiki/Main_Page' | |
] | |
links = set() | |
def filter_duplicate_link(self, link): | |
if link not in self.links: | |
self.links.add(link) | |
return False | |
else: | |
return True | |
def parse(self, response): | |
for link in response.xpath('//a/@href').extract(): | |
# Link do not belong to wikipedia, yield a new Item | |
if 'http://' in link or 'https://' in link not in link: | |
if not self.filter_duplicate_link(link): | |
yield LinkItem( | |
href=link | |
) | |
# Link belong to wikipedia, follow (We don't want the "Special" links and do want the "wiki" links) | |
elif 'Special' not in link and 'wiki' in link: | |
yield Request( | |
urljoin(response.url, link) | |
) |
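
To try the spider outside of the usual scrapy crawl wikipedia command, you can drive it with Scrapy's CrawlerProcess. A minimal sketch, assuming the two files above live in a regular Scrapy project package (the import path is hypothetical):

from scrapy.crawler import CrawlerProcess

from myproject.spiders.wikipedia import WikipediaSpider  # hypothetical path

process = CrawlerProcess(settings={
    # Export the yielded LinkItem objects as JSON (FEEDS requires Scrapy 2.1+)
    'FEEDS': {'links.json': {'format': 'json'}},
    'CLOSESPIDER_PAGECOUNT': 10,  # stop after a few pages for a quick test
})
process.crawl(WikipediaSpider)
process.start()

The CLOSESPIDER_PAGECOUNT cap is just a safeguard for experimentation; without it, a crawl seeded from the Main Page would keep following internal links indefinitely.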