Requirements: python3 and scrapy (pip install scrapy)
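Usage: first crawl the site and write the broken links to items.csv:
scrapy runspider -o items.csv -a site="https://yoursite.org" 1spider.py
Then print the report grouped by page:
python3 2format_results.py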
import scrapy | |
class BrokenLinksSpider(scrapy.Spider):
    """Crawl a site and report every link that answers with 404 or 500.

    Start it with ``-a site=<start URL>``. Each broken response is yielded as
    an item holding the broken URL, the page that linked to it, the link's
    href and text, and the HTTP status.
    """

    name = 'brokenlink-checker'
    # Let 404/500 responses reach parse() instead of being filtered out
    # before the callback runs.
    handle_httpstatus_list = [404, 500]

    def __init__(self, site, *args, **kwargs):
        """Remember the start URL and the part of it used to stay on-site.

        Args:
            site: URL the crawl starts from, e.g. ``https://yoursite.org``.
        """
        super().__init__(*args, **kwargs)
        self.start_urls = [site]
        # Everything after the scheme separator. A value without '//' used
        # to raise IndexError here; fall back to the raw string so Scrapy can
        # report the malformed start URL itself.
        pieces = site.split('//')
        self.DOMAIN = pieces[1] if len(pieces) > 1 else pieces[0]

    def parse(self, response):
        """Yield an item for a broken response, then follow on-site links.

        Args:
            response: the downloaded page, or a 404/500 error response.

        Yields:
            A dict describing a broken link, and/or follow-up requests for
            every ``<a href>`` found on pages belonging to the crawled site.
        """
        # Local import keeps this block self-contained.
        from scrapy.http import TextResponse

        if response.status in self.handle_httpstatus_list:
            # The start request carries no prev_* meta, so use .get():
            # a broken start URL must be reported, not crash with KeyError.
            meta = response.meta
            yield {
                'url': response.url,
                'prev_page': meta.get('prev_url'),
                'prev_link_url': meta.get('prev_href'),
                'prev_link_text': meta.get('prev_link_text'),
                'status': response.status,
            }

        # Only expand links found on the crawled site; external pages are
        # fetched (to check their status) but not crawled further.
        # NOTE(review): this is a substring test on the full URL, so an
        # external URL that merely contains the domain text also matches.
        if self.DOMAIN not in response.url:
            return
        # Binary responses (PDFs, images, ...) have no selectors; calling
        # .css() on them raises, so skip link extraction for those.
        if not isinstance(response, TextResponse):
            return

        for link in response.css('a'):
            href = link.xpath('@href').extract()
            text = link.xpath('text()').extract()
            if href:  # anchors without an href cannot be followed
                yield response.follow(link, self.parse, meta={
                    'prev_link_text': text,
                    'prev_href': href,
                    'prev_url': response.url,
                })
import csv, itertools | |
def main(path='items.csv'):
    """Print the broken links stored in *path*, grouped by referring page.

    Args:
        path: CSV file with at least the columns ``prev_page``,
            ``prev_link_text`` and ``prev_link_url``.

    Rows are bucketed by ``prev_page`` in order of first appearance, so a
    page is printed once even when its broken links are scattered through
    the file. Rows with an empty ``prev_page`` are skipped.
    """
    links_by_page = {}
    # newline='' is what the csv module expects for files it parses; the
    # with-block closes the handle that used to be leaked.
    # NOTE(review): assumes the feed was exported as UTF-8 - confirm if
    # FEED_EXPORT_ENCODING is customised.
    with open(path, newline='', encoding='utf-8') as csv_file:
        for row in csv.DictReader(csv_file):
            links_by_page.setdefault(row['prev_page'], []).append(row)

    for page, links in links_by_page.items():
        if not page:
            continue
        print('PAGE:', page)
        for line in links:
            print(' LINK TEXT:', line['prev_link_text'])
            print(' LINK URL:', line['prev_link_url'])
            print()
        print()


if __name__ == '__main__':
    main()