from urllib.parse import urlparse

import scrapy
from scrapy.http import TextResponse


class ErrorCheckSpider(scrapy.Spider):
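    """Crawl a site looking for broken pages.

    A response is flagged if its HTTP status is in handle_httpstatus_list,
    if one of those codes appears in its final URL, or if its body contains
    one of the strings in text_to_search_for. Flagged pages are yielded as
    items along with the page and link that led to them.
    """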
    name = 'error-checker'
    # Let these error statuses through to parse() instead of having Scrapy's
    # HttpError middleware filter them out.
    handle_httpstatus_list = [404, 500, 403]
    # Strings that commonly appear in the body of an error page. Matching is
    # case-sensitive, hence the upper-/lowercase variants.
    text_to_search_for = [
        "PAGE NOT FOUND",
        "syntax error",
        "end of file",
        "access denied", "ACCESS DENIED",
        "403 forbidden", "403 FORBIDDEN",
        "undefined variable", "UNDEFINED VARIABLE",
    ]

    def __init__(self, site, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.start_urls = [site]
        # Bare domain of the site being crawled, e.g. 'example.com' for
        # 'https://example.com'. Used to decide which pages to crawl further.
        self.DOMAIN = urlparse(site).netloc

    # Besides the HTTP status code, also check for specific text in the body
    # of the returned page and for an error code embedded in the URL (in case
    # the page redirects without returning an error status). This adds some
    # false positives, but makes sure everything is caught.
    def parse(self, response):
        # response.text only exists on text responses; binary bodies (e.g.
        # PDFs or images) would raise, so fall back to an empty string.
        page_text = response.text if isinstance(response, TextResponse) else ''
        if (response.status in self.handle_httpstatus_list
                or any(str(code) in response.url for code in self.handle_httpstatus_list)
                or any(phrase in page_text for phrase in self.text_to_search_for)):
            item = {}
            item['url'] = response.url
            # The first page we crawl has no referring page recorded in meta,
            # so fall back to a placeholder rather than raising a KeyError.
            item['prev_page'] = response.meta.get('prev_url', 'First page')
            item['prev_link_url'] = response.meta.get('prev_href', 'First page')
            item['prev_link_text'] = response.meta.get('prev_link_text', 'First page')
            item['status'] = response.status

            yield item

        # Only follow links found on pages within our own domain; offsite
        # pages still get checked above, but their links aren't crawled.
        if self.DOMAIN in response.url:
            for link in response.css('a'):
                href = link.xpath('@href').getall()
                # Strip whitespace from each piece of link text.
                text = [x.strip() for x in link.xpath('text()').getall()]

                if href:  # maybe should report an error if there's no href
                    yield response.follow(link, self.parse, meta={
                        'prev_link_text': text,
                        'prev_href': href,
                        'prev_url': response.url,
                    })
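
# A minimal usage sketch (the filename, site, and output path below are
# illustrative, not part of the code above):
#
#   scrapy runspider error_checker.py -a site=https://example.com -o errors.csv
#
# -a passes site into __init__, and -o writes each yielded item to a feed
# file, with the format (CSV here) inferred from the extension.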