Requires python3 and scrapy (pip install scrapy).
scrapy runspider -o items.csv -a site="https://yoursite.org" 1spider.py
python3 2format_results.py
| import scrapy | |
| class BrokenLinksSpider(scrapy.Spider): | |
| name = 'brokenlink-checker' | |
| handle_httpstatus_list = [404, 500] | |
| def __init__(self, site, *args, **kwargs): | |
| super().__init__(*args, **kwargs) | |
| self.start_urls = [site] | |
| self.DOMAIN = site.split('//')[1] | |
| def parse(self, response): | |
| if response.status in (404, 500): | |
| item = {} | |
| item['url'] = response.url | |
| item['prev_page'] = response.meta['prev_url'] | |
| item['prev_link_url'] = response.meta['prev_href'] | |
| item['prev_link_text'] = response.meta['prev_link_text'] | |
| item['status'] = response.status | |
| yield item | |
| if self.DOMAIN in response.url: | |
| for link in response.css('a'): | |
| href = link.xpath('@href').extract() | |
| text = link.xpath('text()').extract() | |
| if href: # maybe should show an error if no href | |
| yield response.follow(link, self.parse, meta={ | |
| 'prev_link_text': text, | |
| 'prev_href': href, | |
| 'prev_url': response.url, | |
| }) |
| import csv, itertools | |
| items = csv.DictReader(open('items.csv')) | |
| for page, links in itertools.groupby(items, lambda item: item['prev_page']): | |
| if page: | |
| print('PAGE:', page) | |
| for line in links: | |
| print(' LINK TEXT:', line['prev_link_text']) | |
| print(' LINK URL:', line['prev_link_url']) | |
| print() | |
| print() |