Find broken links contained on pages within a domain's sitemap.xml
""" | |
usage: python broken_links.py somedomain.com > broken_links.list | |
somedomain.com/sitemap.xml will be grabbed and parsed | |
for each location entry, the page is fetched and all links on that page | |
make a HEAD request. The output is a summary of broken links and what entries or locations | |
contained them from sitemap.xml | |
this offeres a crude but effective way to spot essential pages across a domain. | |
""" | |
import sys

import requests
from lxml import html

# maps each checked link to its HTTP status and the pages that linked to it
LINK_CACHE = {}

def fetch_locations_from_sitemap(domain):
    # lxml's HTML parser is forgiving about the sitemap XML namespace,
    # so a plain //url/loc xpath finds each location entry
    sitemap_root = html.parse('http://' + domain + '/sitemap.xml')
    locations = sitemap_root.xpath('//url/loc')
    return locations

def check_pages_links(domain, url, page_root):
    for element, attribute, link, pos in page_root.iterlinks():
        # probably should make all urls absolute, etc.
        if link.startswith('/') and not link.startswith('//'):
            # if link is relative then fetch and ensure a 200
            link = 'http://' + domain + link
        if link in LINK_CACHE:
            LINK_CACHE[link]['pages'].append(url)
        else:
            if link.startswith('http'):
                # HEAD is enough to check the status without downloading the body
                try:
                    status_code = requests.head(link, timeout=10).status_code
                except requests.RequestException:
                    status_code = 'request failed'
                LINK_CACHE[link] = {
                    'pages': [url],
                    'status_code': status_code
                }

def broken_link_report():
    """
    Loop through and show broken links and the pages they are on.
    """
    for link, value in LINK_CACHE.items():
        if value['status_code'] != 200:
            print("** '{0}' is potentially broken ({1})".format(link, value['status_code']))
            for page in value['pages']:
                print("    - {0}".format(page))
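
# Example output (illustrative values, not from a real run):
#
#   ** 'http://somedomain.com/old-page' is potentially broken (404)
#       - http://somedomain.com/blog/some-post
#       - http://somedomain.com/about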

def find_broken_links(domain):
    locations = fetch_locations_from_sitemap(domain)
    for location in locations:
        url = location.text
        # fetch the page behind each sitemap location and check its links
        page_root = html.parse(url).getroot()
        check_pages_links(domain, url, page_root)
    broken_link_report()

if __name__ == "__main__":
    domain = sys.argv[1]
    print("Finding broken links using the sitemap from the following domain: {0}".format(domain))
    find_broken_links(domain)
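
The "probably should make all urls absolute" comment in check_pages_links hints at a more robust approach than prefixing the domain. A minimal sketch using Python's urllib.parse.urljoin (resolve_link is a hypothetical helper, not part of the original gist):

from urllib.parse import urljoin

def resolve_link(page_url, link):
    # urljoin resolves relative paths, protocol-relative //host/... links,
    # and leaves already-absolute URLs untouched
    return urljoin(page_url, link)

# e.g. resolve_link('http://somedomain.com/blog/', '../about')
# returns 'http://somedomain.com/about'

Resolving against the page URL rather than the domain root also handles links like 'about.html' that are relative to the current directory, which the startswith('/') check skips entirely.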